// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
#include <linux/namei.h>
#include "misc.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
#include "tree-checker.h"
#include "space-info.h"
#include "block-group.h"
#include "discard.h"
#include "zoned.h"

#define BTRFS_BLOCK_GROUP_STRIPE_MASK	(BTRFS_BLOCK_GROUP_RAID0 | \
					 BTRFS_BLOCK_GROUP_RAID10 | \
					 BTRFS_BLOCK_GROUP_RAID56_MASK)

const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = {
		.sub_stripes = 2,
		.dev_stripes = 1,
		.devs_max = 0,	/* 0 == as many as possible */
		.devs_min = 2,
		.tolerated_failures = 1,
		.devs_increment = 2,
		.ncopies = 2,
		.nparity = 0,
		.raid_name = "raid10",
		.bg_flag = BTRFS_BLOCK_GROUP_RAID10,
		.mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 2,
		.devs_min = 2,
		.tolerated_failures = 1,
		.devs_increment = 2,
		.ncopies = 2,
		.nparity = 0,
		.raid_name = "raid1",
		.bg_flag = BTRFS_BLOCK_GROUP_RAID1,
		.mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1C3] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 3,
		.devs_min = 3,
		.tolerated_failures = 2,
		.devs_increment = 3,
		.ncopies = 3,
		.nparity = 0,
		.raid_name = "raid1c3",
		.bg_flag = BTRFS_BLOCK_GROUP_RAID1C3,
		.mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1C4] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 4,
		.devs_min = 4,
		.tolerated_failures = 3,
		.devs_increment = 4,
		.ncopies = 4,
		.nparity = 0,
		.raid_name = "raid1c4",
		.bg_flag = BTRFS_BLOCK_GROUP_RAID1C4,
		.mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
	},
	[BTRFS_RAID_DUP] = {
		.sub_stripes = 1,
		.dev_stripes = 2,
		.devs_max = 1,
		.devs_min = 1,
		.tolerated_failures = 0,
		.devs_increment = 1,
		.ncopies = 2,
		.nparity = 0,
		.raid_name = "dup",
		.bg_flag = BTRFS_BLOCK_GROUP_DUP,
		.mindev_error = 0,
	},
	[BTRFS_RAID_RAID0] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 0,
		.devs_min = 1,
		.tolerated_failures = 0,
		.devs_increment = 1,
		.ncopies = 1,
		.nparity = 0,
		.raid_name = "raid0",
		.bg_flag = BTRFS_BLOCK_GROUP_RAID0,
		.mindev_error = 0,
	},
	[BTRFS_RAID_SINGLE] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 1,
		.devs_min = 1,
		.tolerated_failures = 0,
		.devs_increment = 1,
		.ncopies = 1,
		.nparity = 0,
		.raid_name = "single",
		.bg_flag = 0,
		.mindev_error = 0,
	},
	[BTRFS_RAID_RAID5] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 0,
		.devs_min = 2,
		.tolerated_failures = 1,
		.devs_increment = 1,
		.ncopies = 1,
		.nparity = 1,
		.raid_name = "raid5",
		.bg_flag = BTRFS_BLOCK_GROUP_RAID5,
		.mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID6] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 0,
		.devs_min = 3,
		.tolerated_failures = 2,
		.devs_increment = 1,
		.ncopies = 1,
		.nparity = 2,
		.raid_name = "raid6",
		.bg_flag = BTRFS_BLOCK_GROUP_RAID6,
		.mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
	},
};
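/*
 * Example (illustrative, not part of the original file): profile attributes
 * are looked up by index into the table above, e.g.
 * btrfs_raid_array[BTRFS_RAID_RAID1C3].ncopies is 3 and
 * btrfs_raid_array[BTRFS_RAID_RAID6].nparity is 2.
 */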
/*
 * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
 * can be used as index to access btrfs_raid_array[].
 */
enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
{
	if (flags & BTRFS_BLOCK_GROUP_RAID10)
		return BTRFS_RAID_RAID10;
	else if (flags & BTRFS_BLOCK_GROUP_RAID1)
		return BTRFS_RAID_RAID1;
	else if (flags & BTRFS_BLOCK_GROUP_RAID1C3)
		return BTRFS_RAID_RAID1C3;
	else if (flags & BTRFS_BLOCK_GROUP_RAID1C4)
		return BTRFS_RAID_RAID1C4;
	else if (flags & BTRFS_BLOCK_GROUP_DUP)
		return BTRFS_RAID_DUP;
	else if (flags & BTRFS_BLOCK_GROUP_RAID0)
		return BTRFS_RAID_RAID0;
	else if (flags & BTRFS_BLOCK_GROUP_RAID5)
		return BTRFS_RAID_RAID5;
	else if (flags & BTRFS_BLOCK_GROUP_RAID6)
		return BTRFS_RAID_RAID6;

	return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
}

const char *btrfs_bg_type_to_raid_name(u64 flags)
{
	const int index = btrfs_bg_flags_to_raid_index(flags);

	if (index >= BTRFS_NR_RAID_TYPES)
		return NULL;

	return btrfs_raid_array[index].raid_name;
}

/*
 * Fill @buf with textual description of @bg_flags, no more than @size_buf
 * bytes including terminating null byte.
 */
void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
{
	int i;
	int ret;
	char *bp = buf;
	u64 flags = bg_flags;
	u32 size_bp = size_buf;

	if (!flags) {
		strcpy(bp, "NONE");
		return;
	}

#define DESCRIBE_FLAG(flag, desc)					\
	do {								\
		if (flags & (flag)) {					\
			ret = snprintf(bp, size_bp, "%s|", (desc));	\
			if (ret < 0 || ret >= size_bp)			\
				goto out_overflow;			\
			size_bp -= ret;					\
			bp += ret;					\
			flags &= ~(flag);				\
		}							\
	} while (0)

	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");

	DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
		DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
			      btrfs_raid_array[i].raid_name);
#undef DESCRIBE_FLAG

	if (flags) {
		ret = snprintf(bp, size_bp, "0x%llx|", flags);
		size_bp -= ret;
	}

	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0'; /* remove last | */

	/*
	 * The text is trimmed, it's up to the caller to provide a sufficiently
	 * large buffer.
	 */
out_overflow:;
}
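/*
 * Example (illustrative, not part of the original file): for bg_flags of
 * BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_RAID1 the buffer filled in by
 * btrfs_describe_block_groups() ends up containing "data|raid1", while a zero
 * bg_flags yields "NONE".
 */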
static int init_first_rw_device(struct btrfs_trans_handle *trans);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
			     enum btrfs_map_op op,
			     u64 logical, u64 *length,
			     struct btrfs_io_context **bioc_ret,
			     int mirror_num, int need_raid_map);

/*
 * Device locking
 * ==============
 *
 * There are several mutexes that protect manipulation of devices and low-level
 * structures like chunks but not block groups, extents or files
 *
 * uuid_mutex (global lock)
 * ------------------------
 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 * device) or requested by the device= mount option
 *
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
 * seeding, structure cloning, opening/closing devices at mount/umount time
 *
 * global::fs_devs - add, remove, updates to the global list
 *
 * does not protect: manipulation of the fs_devices::devices list in general
 * but in mount context it could be used to exclude list modifications by e.g.
 * the scan ioctl
 *
 * btrfs_device::name - renames (write side), read is RCU
 *
 * fs_devices::device_list_mutex (per-fs, with RCU)
 * ------------------------------------------------
 * protects updates to fs_devices::devices, i.e. adding and deleting
 *
 * simple list traversal with read-only actions can be done with RCU protection
 *
 * may be used to exclude some operations from running concurrently without any
 * modifications to the list (see write_all_supers)
 *
 * Is not required at mount and close times, because our device list is
 * protected by the uuid_mutex at that point.
 *
 * balance_mutex
 * -------------
 * protects balance structures (status, state) and context accessed from
 * several places (internally, ioctl)
 *
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
 * device is added/removed. Additionally it also protects post_commit_list of
 * individual devices, since they can be added to the transaction's
 * post_commit_list only with chunk_mutex held.
 *
 * cleaner_mutex
 * -------------
 * a big lock that is held by the cleaner thread and prevents running subvolume
 * cleaning together with relocation or delayed iputs
 *
 *
 * Lock nesting
 * ============
 *
 * uuid_mutex
 *   device_list_mutex
 *     chunk_mutex
 *   balance_mutex
 *
 *
 * Exclusive operations
 * ====================
 *
 * Maintains the exclusivity of the following operations that apply to the
 * whole filesystem and cannot run in parallel.
 *
 * - Balance (*)
 * - Device add
 * - Device remove
 * - Device replace (*)
 * - Resize
 *
 * The device operations (as above) can be in one of the following states:
 *
 * - Running state
 * - Paused state
 * - Completed state
 *
 * Only device operations marked with (*) can go into the Paused state for the
 * following reasons:
 *
 * - ioctl (only Balance can be Paused through ioctl)
 * - filesystem remounted as read-only
 * - filesystem unmounted and mounted as read-only
 * - system power-cycle and filesystem mounted as read-only
 * - filesystem or device errors leading to forced read-only
 *
 * The status of exclusive operation is set and cleared atomically.
 * During the course of Paused state, fs_info::exclusive_operation remains set.
 * A device operation in Paused or Running state can be canceled or resumed
 * either by ioctl (Balance only) or when remounted as read-write.
 * The exclusive status is cleared when the device operation is canceled or
 * completed.
 */
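/*
 * Illustrative sketch (not a real code path in this file): a hypothetical
 * operation that needed all of the locks above would, per the documented
 * nesting, take and release them in this order:
 *
 *	mutex_lock(&uuid_mutex);
 *	mutex_lock(&fs_devices->device_list_mutex);
 *	mutex_lock(&fs_info->chunk_mutex);
 *	...
 *	mutex_unlock(&fs_info->chunk_mutex);
 *	mutex_unlock(&fs_devices->device_list_mutex);
 *	mutex_unlock(&uuid_mutex);
 */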
DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
{
	return &fs_uuids;
}

/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid:		if not NULL, copy the UUID to fs_devices::fsid
 * @metadata_fsid:	if not NULL, copy the UUID to fs_devices::metadata_fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
						 const u8 *metadata_fsid)
{
	struct btrfs_fs_devices *fs_devs;

	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
	if (!fs_devs)
		return ERR_PTR(-ENOMEM);

	mutex_init(&fs_devs->device_list_mutex);

	INIT_LIST_HEAD(&fs_devs->devices);
	INIT_LIST_HEAD(&fs_devs->alloc_list);
	INIT_LIST_HEAD(&fs_devs->fs_list);
	INIT_LIST_HEAD(&fs_devs->seed_list);
	if (fsid)
		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);

	if (metadata_fsid)
		memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
	else if (fsid)
		memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);

	return fs_devs;
}

void btrfs_free_device(struct btrfs_device *device)
{
	WARN_ON(!list_empty(&device->post_commit_list));
	rcu_string_free(device->name);
	extent_io_tree_release(&device->alloc_state);
	bio_put(device->flush_bio);
	btrfs_destroy_dev_zone_info(device);
	kfree(device);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;

	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		btrfs_free_device(device);
	}
	kfree(fs_devices);
}

void __exit btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *fs_devices;

	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, fs_list);
		list_del(&fs_devices->fs_list);
		free_fs_devices(fs_devices);
	}
}

static noinline struct btrfs_fs_devices *find_fsid(
		const u8 *fsid, const u8 *metadata_fsid)
{
	struct btrfs_fs_devices *fs_devices;

	ASSERT(fsid);

	/* Handle non-split brain cases */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (metadata_fsid) {
			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
			    && memcmp(metadata_fsid, fs_devices->metadata_uuid,
				      BTRFS_FSID_SIZE) == 0)
				return fs_devices;
		} else {
			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
				return fs_devices;
		}
	}
	return NULL;
}

static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
				struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handle scanned device having completed its fsid change but
	 * belonging to a fs_devices that was created by first scanning
	 * a device which didn't have its fsid/metadata_uuid changed
	 * at all and the CHANGING_FSID_V2 flag set.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (fs_devices->fsid_change &&
		    memcmp(disk_super->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0) {
			return fs_devices;
		}
	}
	/*
	 * Handle scanned device having completed its fsid change but
	 * belonging to a fs_devices that was created by a device that
	 * has an outdated pair of fsid/metadata_uuid and
	 * CHANGING_FSID_V2 flag set.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (fs_devices->fsid_change &&
		    memcmp(fs_devices->metadata_uuid,
			   fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
		    memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0) {
			return fs_devices;
		}
	}

	return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
}


static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
		      int flush, struct block_device **bdev,
		      struct btrfs_super_block **disk_super)
{
	int ret;

	*bdev = blkdev_get_by_path(device_path, flags, holder);

	if (IS_ERR(*bdev)) {
		ret = PTR_ERR(*bdev);
		goto error;
	}

	if (flush)
		sync_blockdev(*bdev);
	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
	if (ret) {
		blkdev_put(*bdev, flags);
		goto error;
	}
	invalidate_bdev(*bdev);
	*disk_super = btrfs_read_dev_super(*bdev);
	if (IS_ERR(*disk_super)) {
		ret = PTR_ERR(*disk_super);
		blkdev_put(*bdev, flags);
		goto error;
	}

	return 0;

error:
	*bdev = NULL;
	return ret;
}

static bool device_path_matched(const char *path, struct btrfs_device *device)
{
	int found;

	rcu_read_lock();
	found = strcmp(rcu_str_deref(device->name), path);
	rcu_read_unlock();

	return found == 0;
}

/*
 * Search and remove all stale (devices which are not mounted) devices.
 * When both inputs are NULL, it will search and release all stale devices.
 *
 * @path:	Optional. When provided, it will release only the unmounted
 *		devices matching this path.
 * @skip_dev:	Optional. Will skip this device when searching for the stale
 *		devices.
 *
 * Return:	0 for success or if @path is NULL.
 *		-EBUSY if @path is a mounted device.
 *		-ENOENT if @path does not match any device in the list.
 */
static int btrfs_free_stale_devices(const char *path,
				    struct btrfs_device *skip_device)
{
	struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
	struct btrfs_device *device, *tmp_device;
	int ret = 0;

	lockdep_assert_held(&uuid_mutex);

	if (path)
		ret = -ENOENT;

	list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {

		mutex_lock(&fs_devices->device_list_mutex);
		list_for_each_entry_safe(device, tmp_device,
					 &fs_devices->devices, dev_list) {
			if (skip_device && skip_device == device)
				continue;
			if (path && !device->name)
				continue;
			if (path && !device_path_matched(path, device))
				continue;
			if (fs_devices->opened) {
				/* for an already deleted device return 0 */
				if (path && ret != 0)
					ret = -EBUSY;
				break;
			}

			/* delete the stale device */
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);

			ret = 0;
		}
		mutex_unlock(&fs_devices->device_list_mutex);

		if (fs_devices->num_devices == 0) {
			btrfs_sysfs_remove_fsid(fs_devices);
			list_del(&fs_devices->fs_list);
			free_fs_devices(fs_devices);
		}
	}

	return ret;
}

/*
 * This is only used on mount, and we are protected from competing things
 * messing with our fs_devices by the uuid_mutex, thus we do not need the
 * fs_devices->device_list_mutex here.
 */
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
			struct btrfs_device *device, fmode_t flags,
			void *holder)
{
	struct request_queue *q;
	struct block_device *bdev;
	struct btrfs_super_block *disk_super;
	u64 devid;
	int ret;

	if (device->bdev)
		return -EINVAL;
	if (!device->name)
		return -EINVAL;

	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
				    &bdev, &disk_super);
	if (ret)
		return ret;

	devid = btrfs_stack_device_id(&disk_super->dev_item);
	if (devid != device->devid)
		goto error_free_page;

	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
		goto error_free_page;

	device->generation = btrfs_super_generation(disk_super);

	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
		if (btrfs_super_incompat_flags(disk_super) &
		    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
			pr_err(
		"BTRFS: Invalid seeding and uuid-changed device detected\n");
			goto error_free_page;
		}

		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		fs_devices->seeding = true;
	} else {
		if (bdev_read_only(bdev))
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		else
			set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	}

	q = bdev_get_queue(bdev);
	if (!blk_queue_nonrot(q))
		fs_devices->rotating = true;

	device->bdev = bdev;
	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	device->mode = flags;

	fs_devices->open_devices++;
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		fs_devices->rw_devices++;
		list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
	}
	btrfs_release_disk_super(disk_super);

	return 0;

error_free_page:
	btrfs_release_disk_super(disk_super);
	blkdev_put(bdev, flags);

	return -EINVAL;
}
/*
 * Handle scanned device having its CHANGING_FSID_V2 flag set and the
 * fs_devices being created with a disk that has already completed its fsid
 * change. Such disk can belong to an fs which has its FSID changed or to one
 * which doesn't. Handle both cases here.
 */
static struct btrfs_fs_devices *find_fsid_inprogress(
					struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
			   BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
			return fs_devices;
		}
	}

	return find_fsid(disk_super->fsid, NULL);
}


static struct btrfs_fs_devices *find_fsid_changed(
					struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handles the case where scanned device is part of an fs that had
	 * multiple successful changes of FSID but the currently scanned device
	 * didn't observe it. Meaning our fsid will be different than theirs.
	 * We need to handle two subcases:
	 * 1 - The fs still continues to have different METADATA/FSID uuids.
	 * 2 - The fs is switched back to its original FSID (METADATA/FSID
	 *     are equal).
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		/* Changed UUIDs */
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, disk_super->fsid,
			   BTRFS_FSID_SIZE) != 0)
			return fs_devices;

		/* Unchanged UUIDs */
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, disk_super->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}

	return NULL;
}

static struct btrfs_fs_devices *find_fsid_reverted_metadata(
				struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handle the case where the scanned device is part of an fs whose last
	 * metadata UUID change reverted it to the original FSID. At the same
	 * time fs_devices was first created by another constituent device
	 * which didn't fully observe the operation. This results in a
	 * btrfs_fs_devices created with metadata/fsid different AND
	 * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
	 * fs_devices equal to the FSID of the disk.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    fs_devices->fsid_change)
			return fs_devices;
	}

	return NULL;
}

/*
 * Add new device to list of registered devices
 *
 * Returns:
 * device pointer which was just added or updated when successful
 * error pointer when failed
 */
static noinline struct btrfs_device *device_list_add(const char *path,
			   struct btrfs_super_block *disk_super,
			   bool *new_device_added)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices = NULL;
	struct rcu_string *name;
	u64 found_transid = btrfs_super_generation(disk_super);
	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
	bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
		BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
	bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
					BTRFS_SUPER_FLAG_CHANGING_FSID_V2);

	if (fsid_change_in_progress) {
		if (!has_metadata_uuid)
			fs_devices = find_fsid_inprogress(disk_super);
		else
			fs_devices = find_fsid_changed(disk_super);
	} else if (has_metadata_uuid) {
		fs_devices = find_fsid_with_metadata_uuid(disk_super);
	} else {
		fs_devices = find_fsid_reverted_metadata(disk_super);
		if (!fs_devices)
			fs_devices = find_fsid(disk_super->fsid, NULL);
	}


	if (!fs_devices) {
		if (has_metadata_uuid)
			fs_devices = alloc_fs_devices(disk_super->fsid,
						      disk_super->metadata_uuid);
		else
			fs_devices = alloc_fs_devices(disk_super->fsid, NULL);

		if (IS_ERR(fs_devices))
			return ERR_CAST(fs_devices);

		fs_devices->fsid_change = fsid_change_in_progress;

		mutex_lock(&fs_devices->device_list_mutex);
		list_add(&fs_devices->fs_list, &fs_uuids);

		device = NULL;
	} else {
		struct btrfs_dev_lookup_args args = {
			.devid = devid,
			.uuid = disk_super->dev_item.uuid,
		};

		mutex_lock(&fs_devices->device_list_mutex);
		device = btrfs_find_device(fs_devices, &args);

		/*
		 * If this disk has been pulled into an fs devices created by
		 * a device which had the CHANGING_FSID_V2 flag then replace the
		 * metadata_uuid/fsid values of the fs_devices.
		 */
		if (fs_devices->fsid_change &&
		    found_transid > fs_devices->latest_generation) {
			memcpy(fs_devices->fsid, disk_super->fsid,
			       BTRFS_FSID_SIZE);

			if (has_metadata_uuid)
				memcpy(fs_devices->metadata_uuid,
				       disk_super->metadata_uuid,
				       BTRFS_FSID_SIZE);
			else
				memcpy(fs_devices->metadata_uuid,
				       disk_super->fsid, BTRFS_FSID_SIZE);

			fs_devices->fsid_change = false;
		}
	}

	if (!device) {
		if (fs_devices->opened) {
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-EBUSY);
		}

		device = btrfs_alloc_device(NULL, &devid,
					    disk_super->dev_item.uuid);
		if (IS_ERR(device)) {
			mutex_unlock(&fs_devices->device_list_mutex);
			/* we can safely leave the fs_devices entry around */
			return device;
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			btrfs_free_device(device);
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-ENOMEM);
		}
		rcu_assign_pointer(device->name, name);

		list_add_rcu(&device->dev_list, &fs_devices->devices);
		fs_devices->num_devices++;

		device->fs_devices = fs_devices;
		*new_device_added = true;

		if (disk_super->label[0])
			pr_info(
	"BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->label, devid, found_transid, path,
				current->comm, task_pid_nr(current));
		else
			pr_info(
	"BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->fsid, devid, found_transid, path,
				current->comm, task_pid_nr(current));

	} else if (!device->name || strcmp(device->name->str, path)) {
		/*
		 * When FS is already mounted.
		 * 1. If you are here and if the device->name is NULL that
		 *    means this device was missing at time of FS mount.
		 * 2. If you are here and if the device->name is different
		 *    from 'path' that means either
		 *      a. The same device disappeared and reappeared with
		 *         different name. or
		 *      b. The missing-disk-which-was-replaced, has
		 *         reappeared now.
		 *
		 * We must allow 1 and 2a above. But 2b would be spurious
		 * and unintentional.
		 *
		 * Further in case of 1 and 2a above, the disk at 'path'
		 * would have missed some transaction when it was away and
		 * in case of 2a the stale bdev has to be updated as well.
		 * 2b must not be allowed at any time.
		 */

		/*
		 * For now, we do allow update to btrfs_fs_device through the
		 * btrfs dev scan cli after FS has been mounted. We're still
		 * tracking a problem where systems fail mount by subvolume id
		 * when we reject replacement on a mounted FS.
		 */
		if (!fs_devices->opened && found_transid < device->generation) {
			/*
			 * That is if the FS is _not_ mounted and if you
			 * are here, that means there is more than one
			 * disk with same uuid and devid. We keep the one
			 * with larger generation number or the last-in if
			 * generations are equal.
			 */
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-EEXIST);
		}

		/*
		 * We are going to replace the device path for a given devid,
		 * make sure it's the same device if the device is mounted
		 */
		if (device->bdev) {
			int error;
			dev_t path_dev;

			error = lookup_bdev(path, &path_dev);
			if (error) {
				mutex_unlock(&fs_devices->device_list_mutex);
				return ERR_PTR(error);
			}

			if (device->bdev->bd_dev != path_dev) {
				mutex_unlock(&fs_devices->device_list_mutex);
				/*
				 * device->fs_info may not be reliable here, so
				 * pass in a NULL instead. This avoids a
				 * possible use-after-free when the fs_info and
				 * fs_info->sb are already torn down.
				 */
				btrfs_warn_in_rcu(NULL,
	"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
						  path, devid, found_transid,
						  current->comm,
						  task_pid_nr(current));
				return ERR_PTR(-EEXIST);
			}
			btrfs_info_in_rcu(device->fs_info,
	"devid %llu device path %s changed to %s scanned by %s (%d)",
					  devid, rcu_str_deref(device->name),
					  path, current->comm,
					  task_pid_nr(current));
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-ENOMEM);
		}
		rcu_string_free(device->name);
		rcu_assign_pointer(device->name, name);
		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
			fs_devices->missing_devices--;
			clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		}
	}

	/*
	 * Unmount does not free the btrfs_device struct but would zero
	 * generation along with most of the other members. So just update
	 * it back. We need it to pick the disk with largest generation
	 * (as above).
	 */
	if (!fs_devices->opened) {
		device->generation = found_transid;
		fs_devices->latest_generation = max_t(u64, found_transid,
						fs_devices->latest_generation);
	}

	fs_devices->total_devices = btrfs_super_num_devices(disk_super);

	mutex_unlock(&fs_devices->device_list_mutex);
	return device;
}

static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;
	int ret = 0;

	lockdep_assert_held(&uuid_mutex);

	fs_devices = alloc_fs_devices(orig->fsid, NULL);
	if (IS_ERR(fs_devices))
		return fs_devices;

	fs_devices->total_devices = orig->total_devices;

	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
		struct rcu_string *name;

		device = btrfs_alloc_device(NULL, &orig_dev->devid,
					    orig_dev->uuid);
		if (IS_ERR(device)) {
			ret = PTR_ERR(device);
			goto error;
		}

		/*
		 * This is ok to do without rcu read locked because we hold the
		 * uuid mutex so nothing we touch in here is going to disappear.
		 */
		if (orig_dev->name) {
			name = rcu_string_strdup(orig_dev->name->str,
						 GFP_KERNEL);
			if (!name) {
				btrfs_free_device(device);
				ret = -ENOMEM;
				goto error;
			}
			rcu_assign_pointer(device->name, name);
		}

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
	return fs_devices;
error:
	free_fs_devices(fs_devices);
	return ERR_PTR(ret);
}

static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
				      struct btrfs_device **latest_dev)
{
	struct btrfs_device *device, *next;

	/* This is the initialized path, it is safe to release the devices. */
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
				      &device->dev_state) &&
			    !test_bit(BTRFS_DEV_STATE_MISSING,
				      &device->dev_state) &&
			    (!*latest_dev ||
			     device->generation > (*latest_dev)->generation)) {
				*latest_dev = device;
			}
			continue;
		}

		/*
		 * We have already validated the presence of
		 * BTRFS_DEV_REPLACE_DEVID in btrfs_init_dev_replace(), so just
		 * continue.
		 */
		if (device->devid == BTRFS_DEV_REPLACE_DEVID)
			continue;

		if (device->bdev) {
			blkdev_put(device->bdev, device->mode);
			device->bdev = NULL;
			fs_devices->open_devices--;
		}
		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
			list_del_init(&device->dev_alloc_list);
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
			fs_devices->rw_devices--;
		}
		list_del_init(&device->dev_list);
		fs_devices->num_devices--;
		btrfs_free_device(device);
	}
}

/*
 * After we have read the system tree and know devids belonging to this
 * filesystem, remove the devices that do not belong there.
 */
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *latest_dev = NULL;
	struct btrfs_fs_devices *seed_dev;

	mutex_lock(&uuid_mutex);
	__btrfs_free_extra_devids(fs_devices, &latest_dev);

	list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
		__btrfs_free_extra_devids(seed_dev, &latest_dev);

	fs_devices->latest_dev = latest_dev;

	mutex_unlock(&uuid_mutex);
}

static void btrfs_close_bdev(struct btrfs_device *device)
{
	if (!device->bdev)
		return;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		sync_blockdev(device->bdev);
		invalidate_bdev(device->bdev);
	}

	blkdev_put(device->bdev, device->mode);
}

static void btrfs_close_one_device(struct btrfs_device *device)
{
	struct btrfs_fs_devices *fs_devices = device->fs_devices;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		list_del_init(&device->dev_alloc_list);
		fs_devices->rw_devices--;
	}

	if (device->devid == BTRFS_DEV_REPLACE_DEVID)
		clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
		clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		fs_devices->missing_devices--;
	}

	btrfs_close_bdev(device);
	if (device->bdev) {
		fs_devices->open_devices--;
		device->bdev = NULL;
	}
	clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	btrfs_destroy_dev_zone_info(device);

	device->fs_info = NULL;
	atomic_set(&device->dev_stats_ccnt, 0);
	extent_io_tree_release(&device->alloc_state);

	/*
	 * Reset the flush error record. We might have a transient flush error
	 * in this mount, and if so we aborted the current transaction and set
	 * the fs to an error state, guaranteeing no super blocks can be further
	 * committed. However that error might be transient and if we unmount the
	 * filesystem and mount it again, we should allow the mount to succeed
	 * (btrfs_check_rw_degradable() should not fail) - if after mounting the
	 * filesystem again we still get flush errors, then we will again abort
	 * any transaction and set the error state, guaranteeing no commits of
	 * unsafe super blocks.
	 */
	device->last_flush_error = 0;

	/* Verify the device is back in a pristine state */
	ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
	ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
	ASSERT(list_empty(&device->dev_alloc_list));
	ASSERT(list_empty(&device->post_commit_list));
}

static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device, *tmp;

	lockdep_assert_held(&uuid_mutex);

	if (--fs_devices->opened > 0)
		return;

	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
		btrfs_close_one_device(device);

	WARN_ON(fs_devices->open_devices);
	WARN_ON(fs_devices->rw_devices);
	fs_devices->opened = 0;
	fs_devices->seeding = false;
	fs_devices->fs_info = NULL;
}

void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	LIST_HEAD(list);
	struct btrfs_fs_devices *tmp;

	mutex_lock(&uuid_mutex);
	close_fs_devices(fs_devices);
	if (!fs_devices->opened)
		list_splice_init(&fs_devices->seed_list, &list);

	list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
		close_fs_devices(fs_devices);
		list_del(&fs_devices->seed_list);
		free_fs_devices(fs_devices);
	}
	mutex_unlock(&uuid_mutex);
}

static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
			   fmode_t flags, void *holder)
{
	struct btrfs_device *device;
	struct btrfs_device *latest_dev = NULL;
	struct btrfs_device *tmp_device;

	flags |= FMODE_EXCL;

	list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
				 dev_list) {
		int ret;

		ret = btrfs_open_one_device(fs_devices, device, flags, holder);
		if (ret == 0 &&
		    (!latest_dev || device->generation > latest_dev->generation)) {
			latest_dev = device;
		} else if (ret == -ENODATA) {
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);
		}
	}
	if (fs_devices->open_devices == 0)
		return -EINVAL;

	fs_devices->opened = 1;
	fs_devices->latest_dev = latest_dev;
	fs_devices->total_rw_bytes = 0;
	fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
	fs_devices->read_policy = BTRFS_READ_POLICY_PID;

	return 0;
}

static int devid_cmp(void *priv, const struct list_head *a,
		     const struct list_head *b)
{
	const struct btrfs_device *dev1, *dev2;

	dev1 = list_entry(a, struct btrfs_device, dev_list);
	dev2 = list_entry(b, struct btrfs_device, dev_list);

	if (dev1->devid < dev2->devid)
		return -1;
	else if (dev1->devid > dev2->devid)
		return 1;
	return 0;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
		       fmode_t flags, void *holder)
{
	int ret;

	lockdep_assert_held(&uuid_mutex);
	/*
	 * The device_list_mutex cannot be taken here in case opening the
	 * underlying device takes further locks like open_mutex.
	 *
	 * We also don't need the lock here as this is called during mount and
	 * exclusion is provided by uuid_mutex
	 */

	if (fs_devices->opened) {
		fs_devices->opened++;
		ret = 0;
	} else {
		list_sort(NULL, &fs_devices->devices, devid_cmp);
		ret = open_fs_devices(fs_devices, flags, holder);
	}

	return ret;
}

void btrfs_release_disk_super(struct btrfs_super_block *super)
{
	struct page *page = virt_to_page(super);

	put_page(page);
}

static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
						       u64 bytenr, u64 bytenr_orig)
{
	struct btrfs_super_block *disk_super;
	struct page *page;
	void *p;
	pgoff_t index;

	/* make sure our super fits in the device */
	if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev))
		return ERR_PTR(-EINVAL);

	/* make sure our super fits in the page */
	if (sizeof(*disk_super) > PAGE_SIZE)
		return ERR_PTR(-EINVAL);

	/* make sure our super doesn't straddle pages on disk */
	index = bytenr >> PAGE_SHIFT;
	if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
		return ERR_PTR(-EINVAL);

	/* pull in the page with our super */
	page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);

	if (IS_ERR(page))
		return ERR_CAST(page);

	p = page_address(page);

	/* align our pointer to the offset of the super block */
	disk_super = p + offset_in_page(bytenr);

	if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
	    btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
		btrfs_release_disk_super(p);
		return ERR_PTR(-EINVAL);
	}

	if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
		disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;

	return disk_super;
}

int btrfs_forget_devices(const char *path)
{
	int ret;

	mutex_lock(&uuid_mutex);
	ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
	mutex_unlock(&uuid_mutex);

	return ret;
}

/*
 * Look for a btrfs signature on a device. This may be called out of the mount path
 * and we are not allowed to call set_blocksize during the scan. The superblock
 * is read via pagecache
 */
struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
					   void *holder)
{
	struct btrfs_super_block *disk_super;
	bool new_device_added = false;
	struct btrfs_device *device = NULL;
	struct block_device *bdev;
	u64 bytenr, bytenr_orig;
	int ret;

	lockdep_assert_held(&uuid_mutex);

	/*
	 * we would like to check all the supers, but that would make
	 * a btrfs mount succeed after a mkfs from a different FS.
	 * So, we need to add a special mount option to scan for
	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
	 */
	flags |= FMODE_EXCL;

	bdev = blkdev_get_by_path(path, flags, holder);
	if (IS_ERR(bdev))
		return ERR_CAST(bdev);

	bytenr_orig = btrfs_sb_offset(0);
	ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
	if (ret) {
		device = ERR_PTR(ret);
		goto error_bdev_put;
	}

	disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
	if (IS_ERR(disk_super)) {
		device = ERR_CAST(disk_super);
		goto error_bdev_put;
	}

	device = device_list_add(path, disk_super, &new_device_added);
	if (!IS_ERR(device)) {
		if (new_device_added)
			btrfs_free_stale_devices(path, device);
	}

	btrfs_release_disk_super(disk_super);

error_bdev_put:
	blkdev_put(bdev, flags);

	return device;
}

/*
 * Try to find a chunk that intersects [start, start + len] range and when one
 * such is found, record the end of it in *start
 */
static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
				    u64 len)
{
	u64 physical_start, physical_end;

	lockdep_assert_held(&device->fs_info->chunk_mutex);

	if (!find_first_extent_bit(&device->alloc_state, *start,
				   &physical_start, &physical_end,
				   CHUNK_ALLOCATED, NULL)) {

		if (in_range(physical_start, *start, len) ||
		    in_range(*start, physical_start,
			     physical_end - physical_start)) {
			*start = physical_end + 1;
			return true;
		}
	}
	return false;
}

static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
{
	switch (device->fs_devices->chunk_alloc_policy) {
	case BTRFS_CHUNK_ALLOC_REGULAR:
		/*
		 * We don't want to overwrite the superblock on the drive nor
		 * any area used by the boot loader (grub for example), so we
		 * make sure to start at an offset of at least 1MB.
		 */
		return max_t(u64, start, SZ_1M);
	case BTRFS_CHUNK_ALLOC_ZONED:
		/*
		 * We don't care about the starting region like regular
		 * allocator, because we anyway use/reserve the first two zones
		 * for superblock logging.
		 */
		return ALIGN(start, device->zone_info->zone_size);
	default:
		BUG();
	}
}

static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
					u64 *hole_start, u64 *hole_size,
					u64 num_bytes)
{
	u64 zone_size = device->zone_info->zone_size;
	u64 pos;
	int ret;
	bool changed = false;

	ASSERT(IS_ALIGNED(*hole_start, zone_size));

	while (*hole_size > 0) {
		pos = btrfs_find_allocatable_zones(device, *hole_start,
						   *hole_start + *hole_size,
						   num_bytes);
		if (pos != *hole_start) {
			*hole_size = *hole_start + *hole_size - pos;
			*hole_start = pos;
			changed = true;
			if (*hole_size < num_bytes)
				break;
		}

		ret = btrfs_ensure_empty_zones(device, pos, num_bytes);

		/* Range is ensured to be empty */
		if (!ret)
			return changed;

		/* Given hole range was invalid (outside of device) */
		if (ret == -ERANGE) {
			*hole_start += *hole_size;
			*hole_size = 0;
			return true;
		}

		*hole_start += zone_size;
		*hole_size -= zone_size;
		changed = true;
	}

	return changed;
}

/**
 * dev_extent_hole_check - check if specified hole is suitable for allocation
 * @device:	the device which we have the hole
 * @hole_start:	starting position of the hole
 * @hole_size:	the size of the hole
 * @num_bytes:	the size of the free space that we need
 *
 * This function may modify @hole_start and @hole_size to reflect the suitable
 * position for allocation. Returns true if hole position is updated, false
 * otherwise.
 */
static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
				  u64 *hole_size, u64 num_bytes)
{
	bool changed = false;
	u64 hole_end = *hole_start + *hole_size;

	for (;;) {
		/*
		 * Check before we set max_hole_start, otherwise we could end up
		 * sending back this offset anyway.
		 */
		if (contains_pending_extent(device, hole_start, *hole_size)) {
			if (hole_end >= *hole_start)
				*hole_size = hole_end - *hole_start;
			else
				*hole_size = 0;
			changed = true;
		}

		switch (device->fs_devices->chunk_alloc_policy) {
		case BTRFS_CHUNK_ALLOC_REGULAR:
			/* No extra check */
			break;
		case BTRFS_CHUNK_ALLOC_ZONED:
			if (dev_extent_hole_check_zoned(device, hole_start,
							hole_size, num_bytes)) {
				changed = true;
				/*
				 * The changed hole can contain pending extent.
				 * Loop again to check that.
				 */
				continue;
			}
			break;
		default:
			BUG();
		}

		break;
	}

	return changed;
}

/*
 * find_free_dev_extent_start - find free space in the specified device
 * @device:	  the device which we search the free space in
 * @num_bytes:	  the size of the free space that we need
 * @search_start: the position from which to begin the search
 * @start:	  store the start of the free space.
 * @len:	  the size of the free space that we find, or the size of the
 *		  max free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find one. But if
 * we don't find suitable free space, it will be used to store the start
 * position of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 *
 * NOTE: This function will search *commit* root of device tree, and does extra
 * check to ensure dev extents are not double allocated.
 * This makes the function safe to allocate dev extents but may not report
 * correct usable device space, as device extent freed in current transaction
 * is not reported as available.
 */
static int find_free_dev_extent_start(struct btrfs_device *device,
				u64 num_bytes, u64 search_start, u64 *start,
				u64 *len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 hole_size;
	u64 max_hole_start;
	u64 max_hole_size;
	u64 extent_end;
	u64 search_end = device->total_bytes;
	int ret;
	int slot;
	struct extent_buffer *l;

	search_start = dev_extent_search_start(device, search_start);

	WARN_ON(device->zone_info &&
		!IS_ALIGNED(num_bytes, device->zone_info->zone_size));

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	max_hole_start = search_start;
	max_hole_size = 0;

again:
	if (search_start >= search_end ||
	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		ret = -ENOSPC;
		goto out;
	}

	path->reada = READA_FORWARD;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = device->devid;
	key.offset = search_start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_backwards(root, &key, path);
	if (ret < 0)
		goto out;

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (key.type != BTRFS_DEV_EXTENT_KEY)
			goto next;

		if (key.offset > search_start) {
			hole_size = key.offset - search_start;
			dev_extent_hole_check(device, &search_start, &hole_size,
					      num_bytes);

			if (hole_size > max_hole_size) {
				max_hole_start = search_start;
				max_hole_size = hole_size;
			}

			/*
			 * If this free space is greater than what we need,
			 * it must be the max free space that we have found
			 * until now, so max_hole_start must point to the start
			 * of this free space and the length of this free space
			 * is stored in max_hole_size. Thus, we return
			 * max_hole_start and max_hole_size and go back to the
			 * caller.
			 */
			if (hole_size >= num_bytes) {
				ret = 0;
				goto out;
			}
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (extent_end > search_start)
			search_start = extent_end;
next:
		path->slots[0]++;
		cond_resched();
	}

	/*
	 * At this point, search_start should be the end of
	 * allocated dev extents, and when shrinking the device,
	 * search_end may be smaller than search_start.
	 */
	if (search_end > search_start) {
		hole_size = search_end - search_start;
		if (dev_extent_hole_check(device, &search_start, &hole_size,
					  num_bytes)) {
			btrfs_release_path(path);
			goto again;
		}

		if (hole_size > max_hole_size) {
			max_hole_start = search_start;
			max_hole_size = hole_size;
		}
	}

	/* See above. */
	if (max_hole_size < num_bytes)
		ret = -ENOSPC;
	else
		ret = 0;

out:
	btrfs_free_path(path);
	*start = max_hole_start;
	if (len)
		*len = max_hole_size;
	return ret;
}

int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
			 u64 *start, u64 *len)
{
	/* FIXME use last free of some kind */
	return find_free_dev_extent_start(device, num_bytes, 0, start, len);
}

static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
				 struct btrfs_device *device,
				 u64 start, u64 *dev_extent_len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf = NULL;
	struct btrfs_dev_extent *extent = NULL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
again:
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid,
					  BTRFS_DEV_EXTENT_KEY);
		if (ret)
			goto out;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
		BUG_ON(found_key.offset > start || found_key.offset +
		       btrfs_dev_extent_length(leaf, extent) < start);
		key = found_key;
		btrfs_release_path(path);
		goto again;
	} else if (ret == 0) {
		leaf = path->nodes[0];
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
	} else {
		goto out;
	}

	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);

	ret = btrfs_del_item(trans, root, path);
	if (ret == 0)
		set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
out:
	btrfs_free_path(path);
	return ret;
}

static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct rb_node *n;
	u64 ret = 0;

	em_tree = &fs_info->mapping_tree;
	read_lock(&em_tree->lock);
	n = rb_last(&em_tree->map.rb_root);
	if (n) {
		em = rb_entry(n, struct extent_map, rb_node);
		ret = em->start + em->len;
	}
	read_unlock(&em_tree->lock);

	return ret;
}

static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
				    u64 *devid_ret)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
	if (ret < 0)
		goto error;

	if (ret == 0) {
		/* Corruption */
		btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
		ret = -EUCLEAN;
		goto error;
	}

	ret = btrfs_previous_item(fs_info->chunk_root, path,
				  BTRFS_DEV_ITEMS_OBJECTID,
				  BTRFS_DEV_ITEM_KEY);
	if (ret) {
		*devid_ret = 1;
	} else {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		*devid_ret = found_key.offset + 1;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

/*
 * the device information is stored in the chunk root
 * the btrfs_device struct should be fully filled in
 */
static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
			      struct btrfs_device *device)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	unsigned long ptr;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	btrfs_reserve_chunk_metadata(trans, true);
	ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
				      &key, sizeof(*dev_item));
	btrfs_trans_release_chunk_metadata(trans);
	if (ret)
		goto out;

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_generation(leaf, dev_item, 0);
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
	btrfs_set_device_total_bytes(leaf, dev_item,
				     btrfs_device_get_disk_total_bytes(device));
	btrfs_set_device_bytes_used(leaf, dev_item,
				    btrfs_device_get_bytes_used(device));
	btrfs_set_device_group(leaf, dev_item, 0);
	btrfs_set_device_seek_speed(leaf, dev_item, 0);
	btrfs_set_device_bandwidth(leaf, dev_item, 0);
	btrfs_set_device_start_offset(leaf, dev_item, 0);

	ptr = btrfs_device_uuid(dev_item);
	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
	ptr = btrfs_device_fsid(dev_item);
	write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
			    ptr, BTRFS_FSID_SIZE);
	btrfs_mark_buffer_dirty(leaf);

	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Function to update ctime/mtime for a given device path.
 * Mainly used for ctime/mtime based probe like libblkid.
 *
 * We don't care about errors here, this is just to be kind to userspace.
 */
1901 */ 1902 static void update_dev_time(const char *device_path) 1903 { 1904 struct path path; 1905 struct timespec64 now; 1906 int ret; 1907 1908 ret = kern_path(device_path, LOOKUP_FOLLOW, &path); 1909 if (ret) 1910 return; 1911 1912 now = current_time(d_inode(path.dentry)); 1913 inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME); 1914 path_put(&path); 1915 } 1916 1917 static int btrfs_rm_dev_item(struct btrfs_device *device) 1918 { 1919 struct btrfs_root *root = device->fs_info->chunk_root; 1920 int ret; 1921 struct btrfs_path *path; 1922 struct btrfs_key key; 1923 struct btrfs_trans_handle *trans; 1924 1925 path = btrfs_alloc_path(); 1926 if (!path) 1927 return -ENOMEM; 1928 1929 trans = btrfs_start_transaction(root, 0); 1930 if (IS_ERR(trans)) { 1931 btrfs_free_path(path); 1932 return PTR_ERR(trans); 1933 } 1934 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1935 key.type = BTRFS_DEV_ITEM_KEY; 1936 key.offset = device->devid; 1937 1938 btrfs_reserve_chunk_metadata(trans, false); 1939 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1940 btrfs_trans_release_chunk_metadata(trans); 1941 if (ret) { 1942 if (ret > 0) 1943 ret = -ENOENT; 1944 btrfs_abort_transaction(trans, ret); 1945 btrfs_end_transaction(trans); 1946 goto out; 1947 } 1948 1949 ret = btrfs_del_item(trans, root, path); 1950 if (ret) { 1951 btrfs_abort_transaction(trans, ret); 1952 btrfs_end_transaction(trans); 1953 } 1954 1955 out: 1956 btrfs_free_path(path); 1957 if (!ret) 1958 ret = btrfs_commit_transaction(trans); 1959 return ret; 1960 } 1961 1962 /* 1963 * Verify that @num_devices satisfies the RAID profile constraints in the whole 1964 * filesystem. It's up to the caller to adjust that number regarding e.g. device 1965 * replace. 1966 */ 1967 static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info, 1968 u64 num_devices) 1969 { 1970 u64 all_avail; 1971 unsigned seq; 1972 int i; 1973 1974 do { 1975 seq = read_seqbegin(&fs_info->profiles_lock); 1976 1977 all_avail = fs_info->avail_data_alloc_bits | 1978 fs_info->avail_system_alloc_bits | 1979 fs_info->avail_metadata_alloc_bits; 1980 } while (read_seqretry(&fs_info->profiles_lock, seq)); 1981 1982 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 1983 if (!(all_avail & btrfs_raid_array[i].bg_flag)) 1984 continue; 1985 1986 if (num_devices < btrfs_raid_array[i].devs_min) 1987 return btrfs_raid_array[i].mindev_error; 1988 } 1989 1990 return 0; 1991 } 1992 1993 static struct btrfs_device * btrfs_find_next_active_device( 1994 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device) 1995 { 1996 struct btrfs_device *next_device; 1997 1998 list_for_each_entry(next_device, &fs_devs->devices, dev_list) { 1999 if (next_device != device && 2000 !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state) 2001 && next_device->bdev) 2002 return next_device; 2003 } 2004 2005 return NULL; 2006 } 2007 2008 /* 2009 * Helper function to check if the given device is part of s_bdev / latest_dev 2010 * and replace it with the provided or the next active device. In the context 2011 * where this function is called, there should always be another device (or 2012 * the provided next_device) which is active.
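 *
 * The two usage patterns in this file are roughly:
 *
 *	btrfs_assign_next_active_device(device, NULL);
 *		(device removal, pick any other active device)
 *	btrfs_assign_next_active_device(fs_devices->latest_dev, device);
 *		(device add, make the newly added device the latest one)
 *
 * In the first form the replacement is looked up with
 * btrfs_find_next_active_device().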
2013 */ 2014 void __cold btrfs_assign_next_active_device(struct btrfs_device *device, 2015 struct btrfs_device *next_device) 2016 { 2017 struct btrfs_fs_info *fs_info = device->fs_info; 2018 2019 if (!next_device) 2020 next_device = btrfs_find_next_active_device(fs_info->fs_devices, 2021 device); 2022 ASSERT(next_device); 2023 2024 if (fs_info->sb->s_bdev && 2025 (fs_info->sb->s_bdev == device->bdev)) 2026 fs_info->sb->s_bdev = next_device->bdev; 2027 2028 if (fs_info->fs_devices->latest_dev->bdev == device->bdev) 2029 fs_info->fs_devices->latest_dev = next_device; 2030 } 2031 2032 /* 2033 * Return btrfs_fs_devices::num_devices excluding the device that's being 2034 * currently replaced. 2035 */ 2036 static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) 2037 { 2038 u64 num_devices = fs_info->fs_devices->num_devices; 2039 2040 down_read(&fs_info->dev_replace.rwsem); 2041 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { 2042 ASSERT(num_devices > 1); 2043 num_devices--; 2044 } 2045 up_read(&fs_info->dev_replace.rwsem); 2046 2047 return num_devices; 2048 } 2049 2050 void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, 2051 struct block_device *bdev, 2052 const char *device_path) 2053 { 2054 struct btrfs_super_block *disk_super; 2055 int copy_num; 2056 2057 if (!bdev) 2058 return; 2059 2060 for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) { 2061 struct page *page; 2062 int ret; 2063 2064 disk_super = btrfs_read_dev_one_super(bdev, copy_num); 2065 if (IS_ERR(disk_super)) 2066 continue; 2067 2068 if (bdev_is_zoned(bdev)) { 2069 btrfs_reset_sb_log_zones(bdev, copy_num); 2070 continue; 2071 } 2072 2073 memset(&disk_super->magic, 0, sizeof(disk_super->magic)); 2074 2075 page = virt_to_page(disk_super); 2076 set_page_dirty(page); 2077 lock_page(page); 2078 /* write_on_page() unlocks the page */ 2079 ret = write_one_page(page); 2080 if (ret) 2081 btrfs_warn(fs_info, 2082 "error clearing superblock number %d (%d)", 2083 copy_num, ret); 2084 btrfs_release_disk_super(disk_super); 2085 2086 } 2087 2088 /* Notify udev that device has changed */ 2089 btrfs_kobject_uevent(bdev, KOBJ_CHANGE); 2090 2091 /* Update ctime/mtime for device path for libblkid */ 2092 update_dev_time(device_path); 2093 } 2094 2095 int btrfs_rm_device(struct btrfs_fs_info *fs_info, 2096 struct btrfs_dev_lookup_args *args, 2097 struct block_device **bdev, fmode_t *mode) 2098 { 2099 struct btrfs_device *device; 2100 struct btrfs_fs_devices *cur_devices; 2101 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2102 u64 num_devices; 2103 int ret = 0; 2104 2105 /* 2106 * The device list in fs_devices is accessed without locks (neither 2107 * uuid_mutex nor device_list_mutex) as it won't change on a mounted 2108 * filesystem and another device rm cannot run. 
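 *
 * Another device add, remove or replace cannot run concurrently either,
 * since those operations are serialized by the exclusive operation lock
 * taken in the ioctl callers (see btrfs_exclop_start()), so only readers
 * look at the list while we are here.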
2109 */ 2110 num_devices = btrfs_num_devices(fs_info); 2111 2112 ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); 2113 if (ret) 2114 goto out; 2115 2116 device = btrfs_find_device(fs_info->fs_devices, args); 2117 if (!device) { 2118 if (args->missing) 2119 ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; 2120 else 2121 ret = -ENOENT; 2122 goto out; 2123 } 2124 2125 if (btrfs_pinned_by_swapfile(fs_info, device)) { 2126 btrfs_warn_in_rcu(fs_info, 2127 "cannot remove device %s (devid %llu) due to active swapfile", 2128 rcu_str_deref(device->name), device->devid); 2129 ret = -ETXTBSY; 2130 goto out; 2131 } 2132 2133 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 2134 ret = BTRFS_ERROR_DEV_TGT_REPLACE; 2135 goto out; 2136 } 2137 2138 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 2139 fs_info->fs_devices->rw_devices == 1) { 2140 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; 2141 goto out; 2142 } 2143 2144 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 2145 mutex_lock(&fs_info->chunk_mutex); 2146 list_del_init(&device->dev_alloc_list); 2147 device->fs_devices->rw_devices--; 2148 mutex_unlock(&fs_info->chunk_mutex); 2149 } 2150 2151 ret = btrfs_shrink_device(device, 0); 2152 if (ret) 2153 goto error_undo; 2154 2155 /* 2156 * TODO: the superblock still includes this device in its num_devices 2157 * counter although write_all_supers() is not locked out. This 2158 * could give a filesystem state which requires a degraded mount. 2159 */ 2160 ret = btrfs_rm_dev_item(device); 2161 if (ret) 2162 goto error_undo; 2163 2164 clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 2165 btrfs_scrub_cancel_dev(device); 2166 2167 /* 2168 * the device list mutex makes sure that we don't change 2169 * the device list while someone else is writing out all 2170 * the device supers. Whoever is writing all supers, should 2171 * lock the device list mutex before getting the number of 2172 * devices in the super block (super_copy). Conversely, 2173 * whoever updates the number of devices in the super block 2174 * (super_copy) should hold the device list mutex. 2175 */ 2176 2177 /* 2178 * In normal cases the cur_devices == fs_devices. But in case 2179 * of deleting a seed device, the cur_devices should point to 2180 * its own fs_devices listed under the fs_devices->seed_list. 2181 */ 2182 cur_devices = device->fs_devices; 2183 mutex_lock(&fs_devices->device_list_mutex); 2184 list_del_rcu(&device->dev_list); 2185 2186 cur_devices->num_devices--; 2187 cur_devices->total_devices--; 2188 /* Update total_devices of the parent fs_devices if it's seed */ 2189 if (cur_devices != fs_devices) 2190 fs_devices->total_devices--; 2191 2192 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) 2193 cur_devices->missing_devices--; 2194 2195 btrfs_assign_next_active_device(device, NULL); 2196 2197 if (device->bdev) { 2198 cur_devices->open_devices--; 2199 /* remove sysfs entry */ 2200 btrfs_sysfs_remove_device(device); 2201 } 2202 2203 num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; 2204 btrfs_set_super_num_devices(fs_info->super_copy, num_devices); 2205 mutex_unlock(&fs_devices->device_list_mutex); 2206 2207 /* 2208 * At this point, the device is zero sized and detached from the 2209 * devices list. All that's left is to zero out the old supers and 2210 * free the device. 
2211 * 2212 * We cannot call btrfs_close_bdev() here because we're holding the sb 2213 * write lock, and blkdev_put() will pull in the ->open_mutex on the 2214 * block device and it's dependencies. Instead just flush the device 2215 * and let the caller do the final blkdev_put. 2216 */ 2217 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 2218 btrfs_scratch_superblocks(fs_info, device->bdev, 2219 device->name->str); 2220 if (device->bdev) { 2221 sync_blockdev(device->bdev); 2222 invalidate_bdev(device->bdev); 2223 } 2224 } 2225 2226 *bdev = device->bdev; 2227 *mode = device->mode; 2228 synchronize_rcu(); 2229 btrfs_free_device(device); 2230 2231 /* 2232 * This can happen if cur_devices is the private seed devices list. We 2233 * cannot call close_fs_devices() here because it expects the uuid_mutex 2234 * to be held, but in fact we don't need that for the private 2235 * seed_devices, we can simply decrement cur_devices->opened and then 2236 * remove it from our list and free the fs_devices. 2237 */ 2238 if (cur_devices->num_devices == 0) { 2239 list_del_init(&cur_devices->seed_list); 2240 ASSERT(cur_devices->opened == 1); 2241 cur_devices->opened--; 2242 free_fs_devices(cur_devices); 2243 } 2244 2245 out: 2246 return ret; 2247 2248 error_undo: 2249 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 2250 mutex_lock(&fs_info->chunk_mutex); 2251 list_add(&device->dev_alloc_list, 2252 &fs_devices->alloc_list); 2253 device->fs_devices->rw_devices++; 2254 mutex_unlock(&fs_info->chunk_mutex); 2255 } 2256 goto out; 2257 } 2258 2259 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev) 2260 { 2261 struct btrfs_fs_devices *fs_devices; 2262 2263 lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex); 2264 2265 /* 2266 * in case of fs with no seed, srcdev->fs_devices will point 2267 * to fs_devices of fs_info. However when the dev being replaced is 2268 * a seed dev it will point to the seed's local fs_devices. In short 2269 * srcdev will have its correct fs_devices in both the cases. 2270 */ 2271 fs_devices = srcdev->fs_devices; 2272 2273 list_del_rcu(&srcdev->dev_list); 2274 list_del(&srcdev->dev_alloc_list); 2275 fs_devices->num_devices--; 2276 if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state)) 2277 fs_devices->missing_devices--; 2278 2279 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) 2280 fs_devices->rw_devices--; 2281 2282 if (srcdev->bdev) 2283 fs_devices->open_devices--; 2284 } 2285 2286 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) 2287 { 2288 struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; 2289 2290 mutex_lock(&uuid_mutex); 2291 2292 btrfs_close_bdev(srcdev); 2293 synchronize_rcu(); 2294 btrfs_free_device(srcdev); 2295 2296 /* if this is no devs we rather delete the fs_devices */ 2297 if (!fs_devices->num_devices) { 2298 /* 2299 * On a mounted FS, num_devices can't be zero unless it's a 2300 * seed. In case of a seed device being replaced, the replace 2301 * target added to the sprout FS, so there will be no more 2302 * device left under the seed FS. 
2303 */ 2304 ASSERT(fs_devices->seeding); 2305 2306 list_del_init(&fs_devices->seed_list); 2307 close_fs_devices(fs_devices); 2308 free_fs_devices(fs_devices); 2309 } 2310 mutex_unlock(&uuid_mutex); 2311 } 2312 2313 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) 2314 { 2315 struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices; 2316 2317 mutex_lock(&fs_devices->device_list_mutex); 2318 2319 btrfs_sysfs_remove_device(tgtdev); 2320 2321 if (tgtdev->bdev) 2322 fs_devices->open_devices--; 2323 2324 fs_devices->num_devices--; 2325 2326 btrfs_assign_next_active_device(tgtdev, NULL); 2327 2328 list_del_rcu(&tgtdev->dev_list); 2329 2330 mutex_unlock(&fs_devices->device_list_mutex); 2331 2332 btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev, 2333 tgtdev->name->str); 2334 2335 btrfs_close_bdev(tgtdev); 2336 synchronize_rcu(); 2337 btrfs_free_device(tgtdev); 2338 } 2339 2340 /** 2341 * Populate args from device at path 2342 * 2343 * @fs_info: the filesystem 2344 * @args: the args to populate 2345 * @path: the path to the device 2346 * 2347 * This will read the super block of the device at @path and populate @args with 2348 * the devid, fsid, and uuid. This is meant to be used for ioctls that need to 2349 * look up a device to operate on, but need to do it before we take any locks. 2350 * This properly handles the special case of "missing" that a user may pass in, 2351 * and does some basic sanity checks. The caller must make sure that @path is 2352 * properly NUL terminated before calling in, and must call 2353 * btrfs_put_dev_args_from_path() in order to free up the temporary fsid and 2354 * uuid buffers. 2355 * 2356 * Return: 0 for success, -errno for failure 2357 */ 2358 int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, 2359 struct btrfs_dev_lookup_args *args, 2360 const char *path) 2361 { 2362 struct btrfs_super_block *disk_super; 2363 struct block_device *bdev; 2364 int ret; 2365 2366 if (!path || !path[0]) 2367 return -EINVAL; 2368 if (!strcmp(path, "missing")) { 2369 args->missing = true; 2370 return 0; 2371 } 2372 2373 args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL); 2374 args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL); 2375 if (!args->uuid || !args->fsid) { 2376 btrfs_put_dev_args_from_path(args); 2377 return -ENOMEM; 2378 } 2379 2380 ret = btrfs_get_bdev_and_sb(path, FMODE_READ, fs_info->bdev_holder, 0, 2381 &bdev, &disk_super); 2382 if (ret) { /* don't leak the uuid/fsid buffers allocated above */ btrfs_put_dev_args_from_path(args); 2383 return ret; } 2384 args->devid = btrfs_stack_device_id(&disk_super->dev_item); 2385 memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE); 2386 if (btrfs_fs_incompat(fs_info, METADATA_UUID)) 2387 memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE); 2388 else 2389 memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE); 2390 btrfs_release_disk_super(disk_super); 2391 blkdev_put(bdev, FMODE_READ); 2392 return 0; 2393 } 2394 2395 /* 2396 * Only use this jointly with btrfs_get_dev_args_from_path() because we will 2397 * allocate our ->uuid and ->fsid pointers, everybody else uses local variables 2398 * that don't need to be freed.
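 *
 * A typical caller therefore pairs the two, roughly:
 *
 *	BTRFS_DEV_LOOKUP_ARGS(args);
 *
 *	ret = btrfs_get_dev_args_from_path(fs_info, &args, path);
 *	if (ret)
 *		return ERR_PTR(ret);
 *	device = btrfs_find_device(fs_info->fs_devices, &args);
 *	btrfs_put_dev_args_from_path(&args);
 *
 * (this is what btrfs_find_device_by_devspec() below does).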
2399 */ 2400 void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args) 2401 { 2402 kfree(args->uuid); 2403 kfree(args->fsid); 2404 args->uuid = NULL; 2405 args->fsid = NULL; 2406 } 2407 2408 struct btrfs_device *btrfs_find_device_by_devspec( 2409 struct btrfs_fs_info *fs_info, u64 devid, 2410 const char *device_path) 2411 { 2412 BTRFS_DEV_LOOKUP_ARGS(args); 2413 struct btrfs_device *device; 2414 int ret; 2415 2416 if (devid) { 2417 args.devid = devid; 2418 device = btrfs_find_device(fs_info->fs_devices, &args); 2419 if (!device) 2420 return ERR_PTR(-ENOENT); 2421 return device; 2422 } 2423 2424 ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path); 2425 if (ret) 2426 return ERR_PTR(ret); 2427 device = btrfs_find_device(fs_info->fs_devices, &args); 2428 btrfs_put_dev_args_from_path(&args); 2429 if (!device) 2430 return ERR_PTR(-ENOENT); 2431 return device; 2432 } 2433 2434 static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info) 2435 { 2436 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2437 struct btrfs_fs_devices *old_devices; 2438 struct btrfs_fs_devices *seed_devices; 2439 2440 lockdep_assert_held(&uuid_mutex); 2441 if (!fs_devices->seeding) 2442 return ERR_PTR(-EINVAL); 2443 2444 /* 2445 * Private copy of the seed devices, anchored at 2446 * fs_info->fs_devices->seed_list 2447 */ 2448 seed_devices = alloc_fs_devices(NULL, NULL); 2449 if (IS_ERR(seed_devices)) 2450 return seed_devices; 2451 2452 /* 2453 * It's necessary to retain a copy of the original seed fs_devices in 2454 * fs_uuids so that filesystems which have been seeded can successfully 2455 * reference the seed device from open_seed_devices. This also supports 2456 * multiple fs seed. 2457 */ 2458 old_devices = clone_fs_devices(fs_devices); 2459 if (IS_ERR(old_devices)) { 2460 kfree(seed_devices); 2461 return old_devices; 2462 } 2463 2464 list_add(&old_devices->fs_list, &fs_uuids); 2465 2466 memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); 2467 seed_devices->opened = 1; 2468 INIT_LIST_HEAD(&seed_devices->devices); 2469 INIT_LIST_HEAD(&seed_devices->alloc_list); 2470 mutex_init(&seed_devices->device_list_mutex); 2471 2472 return seed_devices; 2473 } 2474 2475 /* 2476 * Splice seed devices into the sprout fs_devices. 2477 * Generate a new fsid for the sprouted read-write filesystem. 2478 */ 2479 static void btrfs_setup_sprout(struct btrfs_fs_info *fs_info, 2480 struct btrfs_fs_devices *seed_devices) 2481 { 2482 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2483 struct btrfs_super_block *disk_super = fs_info->super_copy; 2484 struct btrfs_device *device; 2485 u64 super_flags; 2486 2487 /* 2488 * We are updating the fsid, the thread leading to device_list_add() 2489 * could race, so uuid_mutex is needed. 2490 */ 2491 lockdep_assert_held(&uuid_mutex); 2492 2493 /* 2494 * The threads listed below may traverse dev_list but can do that without 2495 * device_list_mutex: 2496 * - All device ops and balance - as we are in btrfs_exclop_start. 2497 * - Various dev_list readers - are using RCU. 2498 * - btrfs_ioctl_fitrim() - is using RCU. 
2499 * 2500 * For-read threads as below are using device_list_mutex: 2501 * - Readonly scrub btrfs_scrub_dev() 2502 * - Readonly scrub btrfs_scrub_progress() 2503 * - btrfs_get_dev_stats() 2504 */ 2505 lockdep_assert_held(&fs_devices->device_list_mutex); 2506 2507 list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, 2508 synchronize_rcu); 2509 list_for_each_entry(device, &seed_devices->devices, dev_list) 2510 device->fs_devices = seed_devices; 2511 2512 fs_devices->seeding = false; 2513 fs_devices->num_devices = 0; 2514 fs_devices->open_devices = 0; 2515 fs_devices->missing_devices = 0; 2516 fs_devices->rotating = false; 2517 list_add(&seed_devices->seed_list, &fs_devices->seed_list); 2518 2519 generate_random_uuid(fs_devices->fsid); 2520 memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE); 2521 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 2522 2523 super_flags = btrfs_super_flags(disk_super) & 2524 ~BTRFS_SUPER_FLAG_SEEDING; 2525 btrfs_set_super_flags(disk_super, super_flags); 2526 } 2527 2528 /* 2529 * Store the expected generation for seed devices in device items. 2530 */ 2531 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) 2532 { 2533 BTRFS_DEV_LOOKUP_ARGS(args); 2534 struct btrfs_fs_info *fs_info = trans->fs_info; 2535 struct btrfs_root *root = fs_info->chunk_root; 2536 struct btrfs_path *path; 2537 struct extent_buffer *leaf; 2538 struct btrfs_dev_item *dev_item; 2539 struct btrfs_device *device; 2540 struct btrfs_key key; 2541 u8 fs_uuid[BTRFS_FSID_SIZE]; 2542 u8 dev_uuid[BTRFS_UUID_SIZE]; 2543 int ret; 2544 2545 path = btrfs_alloc_path(); 2546 if (!path) 2547 return -ENOMEM; 2548 2549 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2550 key.offset = 0; 2551 key.type = BTRFS_DEV_ITEM_KEY; 2552 2553 while (1) { 2554 btrfs_reserve_chunk_metadata(trans, false); 2555 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2556 btrfs_trans_release_chunk_metadata(trans); 2557 if (ret < 0) 2558 goto error; 2559 2560 leaf = path->nodes[0]; 2561 next_slot: 2562 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 2563 ret = btrfs_next_leaf(root, path); 2564 if (ret > 0) 2565 break; 2566 if (ret < 0) 2567 goto error; 2568 leaf = path->nodes[0]; 2569 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2570 btrfs_release_path(path); 2571 continue; 2572 } 2573 2574 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2575 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 2576 key.type != BTRFS_DEV_ITEM_KEY) 2577 break; 2578 2579 dev_item = btrfs_item_ptr(leaf, path->slots[0], 2580 struct btrfs_dev_item); 2581 args.devid = btrfs_device_id(leaf, dev_item); 2582 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 2583 BTRFS_UUID_SIZE); 2584 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 2585 BTRFS_FSID_SIZE); 2586 args.uuid = dev_uuid; 2587 args.fsid = fs_uuid; 2588 device = btrfs_find_device(fs_info->fs_devices, &args); 2589 BUG_ON(!device); /* Logic error */ 2590 2591 if (device->fs_devices->seeding) { 2592 btrfs_set_device_generation(leaf, dev_item, 2593 device->generation); 2594 btrfs_mark_buffer_dirty(leaf); 2595 } 2596 2597 path->slots[0]++; 2598 goto next_slot; 2599 } 2600 ret = 0; 2601 error: 2602 btrfs_free_path(path); 2603 return ret; 2604 } 2605 2606 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) 2607 { 2608 struct btrfs_root *root = fs_info->dev_root; 2609 struct request_queue *q; 2610 struct btrfs_trans_handle *trans; 2611 struct btrfs_device *device; 2612 struct 
block_device *bdev; 2613 struct super_block *sb = fs_info->sb; 2614 struct rcu_string *name; 2615 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2616 struct btrfs_fs_devices *seed_devices; 2617 u64 orig_super_total_bytes; 2618 u64 orig_super_num_devices; 2619 int ret = 0; 2620 bool seeding_dev = false; 2621 bool locked = false; 2622 2623 if (sb_rdonly(sb) && !fs_devices->seeding) 2624 return -EROFS; 2625 2626 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 2627 fs_info->bdev_holder); 2628 if (IS_ERR(bdev)) 2629 return PTR_ERR(bdev); 2630 2631 if (!btrfs_check_device_zone_type(fs_info, bdev)) { 2632 ret = -EINVAL; 2633 goto error; 2634 } 2635 2636 if (fs_devices->seeding) { 2637 seeding_dev = true; 2638 down_write(&sb->s_umount); 2639 mutex_lock(&uuid_mutex); 2640 locked = true; 2641 } 2642 2643 sync_blockdev(bdev); 2644 2645 rcu_read_lock(); 2646 list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { 2647 if (device->bdev == bdev) { 2648 ret = -EEXIST; 2649 rcu_read_unlock(); 2650 goto error; 2651 } 2652 } 2653 rcu_read_unlock(); 2654 2655 device = btrfs_alloc_device(fs_info, NULL, NULL); 2656 if (IS_ERR(device)) { 2657 /* we can safely leave the fs_devices entry around */ 2658 ret = PTR_ERR(device); 2659 goto error; 2660 } 2661 2662 name = rcu_string_strdup(device_path, GFP_KERNEL); 2663 if (!name) { 2664 ret = -ENOMEM; 2665 goto error_free_device; 2666 } 2667 rcu_assign_pointer(device->name, name); 2668 2669 device->fs_info = fs_info; 2670 device->bdev = bdev; 2671 2672 ret = btrfs_get_dev_zone_info(device, false); 2673 if (ret) 2674 goto error_free_device; 2675 2676 trans = btrfs_start_transaction(root, 0); 2677 if (IS_ERR(trans)) { 2678 ret = PTR_ERR(trans); 2679 goto error_free_zone; 2680 } 2681 2682 q = bdev_get_queue(bdev); 2683 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 2684 device->generation = trans->transid; 2685 device->io_width = fs_info->sectorsize; 2686 device->io_align = fs_info->sectorsize; 2687 device->sector_size = fs_info->sectorsize; 2688 device->total_bytes = 2689 round_down(bdev_nr_bytes(bdev), fs_info->sectorsize); 2690 device->disk_total_bytes = device->total_bytes; 2691 device->commit_total_bytes = device->total_bytes; 2692 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 2693 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 2694 device->mode = FMODE_EXCL; 2695 device->dev_stats_valid = 1; 2696 set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); 2697 2698 if (seeding_dev) { 2699 btrfs_clear_sb_rdonly(sb); 2700 2701 /* GFP_KERNEL allocation must not be under device_list_mutex */ 2702 seed_devices = btrfs_init_sprout(fs_info); 2703 if (IS_ERR(seed_devices)) { 2704 ret = PTR_ERR(seed_devices); 2705 btrfs_abort_transaction(trans, ret); 2706 goto error_trans; 2707 } 2708 } 2709 2710 mutex_lock(&fs_devices->device_list_mutex); 2711 if (seeding_dev) { 2712 btrfs_setup_sprout(fs_info, seed_devices); 2713 btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev, 2714 device); 2715 } 2716 2717 device->fs_devices = fs_devices; 2718 2719 mutex_lock(&fs_info->chunk_mutex); 2720 list_add_rcu(&device->dev_list, &fs_devices->devices); 2721 list_add(&device->dev_alloc_list, &fs_devices->alloc_list); 2722 fs_devices->num_devices++; 2723 fs_devices->open_devices++; 2724 fs_devices->rw_devices++; 2725 fs_devices->total_devices++; 2726 fs_devices->total_rw_bytes += device->total_bytes; 2727 2728 atomic64_add(device->total_bytes, &fs_info->free_chunk_space); 2729 2730 if (!blk_queue_nonrot(q)) 2731 
fs_devices->rotating = true; 2732 2733 orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); 2734 btrfs_set_super_total_bytes(fs_info->super_copy, 2735 round_down(orig_super_total_bytes + device->total_bytes, 2736 fs_info->sectorsize)); 2737 2738 orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy); 2739 btrfs_set_super_num_devices(fs_info->super_copy, 2740 orig_super_num_devices + 1); 2741 2742 /* 2743 * We've got more storage, clear any full flags on the space 2744 * infos. 2745 */ 2746 btrfs_clear_space_info_full(fs_info); 2747 2748 mutex_unlock(&fs_info->chunk_mutex); 2749 2750 /* Add sysfs device entry */ 2751 btrfs_sysfs_add_device(device); 2752 2753 mutex_unlock(&fs_devices->device_list_mutex); 2754 2755 if (seeding_dev) { 2756 mutex_lock(&fs_info->chunk_mutex); 2757 ret = init_first_rw_device(trans); 2758 mutex_unlock(&fs_info->chunk_mutex); 2759 if (ret) { 2760 btrfs_abort_transaction(trans, ret); 2761 goto error_sysfs; 2762 } 2763 } 2764 2765 ret = btrfs_add_dev_item(trans, device); 2766 if (ret) { 2767 btrfs_abort_transaction(trans, ret); 2768 goto error_sysfs; 2769 } 2770 2771 if (seeding_dev) { 2772 ret = btrfs_finish_sprout(trans); 2773 if (ret) { 2774 btrfs_abort_transaction(trans, ret); 2775 goto error_sysfs; 2776 } 2777 2778 /* 2779 * fs_devices now represents the newly sprouted filesystem and 2780 * its fsid has been changed by btrfs_setup_sprout(). 2781 */ 2782 btrfs_sysfs_update_sprout_fsid(fs_devices); 2783 } 2784 2785 ret = btrfs_commit_transaction(trans); 2786 2787 if (seeding_dev) { 2788 mutex_unlock(&uuid_mutex); 2789 up_write(&sb->s_umount); 2790 locked = false; 2791 2792 if (ret) /* transaction commit */ 2793 return ret; 2794 2795 ret = btrfs_relocate_sys_chunks(fs_info); 2796 if (ret < 0) 2797 btrfs_handle_fs_error(fs_info, ret, 2798 "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command."); 2799 trans = btrfs_attach_transaction(root); 2800 if (IS_ERR(trans)) { 2801 if (PTR_ERR(trans) == -ENOENT) 2802 return 0; 2803 ret = PTR_ERR(trans); 2804 trans = NULL; 2805 goto error_sysfs; 2806 } 2807 ret = btrfs_commit_transaction(trans); 2808 } 2809 2810 /* 2811 * Now that we have written a new super block to this device, check all 2812 * the other fs_devices lists if device_path alienates any other scanned 2813 * device. 2814 * We can ignore the return value as it typically returns -EINVAL and 2815 * only succeeds if the device was an alien.
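 *
 * ("Alien" here means the path was scanned earlier as part of some other,
 * currently unmounted filesystem; forgetting it drops that stale entry
 * from the list of scanned devices.)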
2816 */ 2817 btrfs_forget_devices(device_path); 2818 2819 /* Update ctime/mtime for blkid or udev */ 2820 update_dev_time(device_path); 2821 2822 return ret; 2823 2824 error_sysfs: 2825 btrfs_sysfs_remove_device(device); 2826 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2827 mutex_lock(&fs_info->chunk_mutex); 2828 list_del_rcu(&device->dev_list); 2829 list_del(&device->dev_alloc_list); 2830 fs_info->fs_devices->num_devices--; 2831 fs_info->fs_devices->open_devices--; 2832 fs_info->fs_devices->rw_devices--; 2833 fs_info->fs_devices->total_devices--; 2834 fs_info->fs_devices->total_rw_bytes -= device->total_bytes; 2835 atomic64_sub(device->total_bytes, &fs_info->free_chunk_space); 2836 btrfs_set_super_total_bytes(fs_info->super_copy, 2837 orig_super_total_bytes); 2838 btrfs_set_super_num_devices(fs_info->super_copy, 2839 orig_super_num_devices); 2840 mutex_unlock(&fs_info->chunk_mutex); 2841 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2842 error_trans: 2843 if (seeding_dev) 2844 btrfs_set_sb_rdonly(sb); 2845 if (trans) 2846 btrfs_end_transaction(trans); 2847 error_free_zone: 2848 btrfs_destroy_dev_zone_info(device); 2849 error_free_device: 2850 btrfs_free_device(device); 2851 error: 2852 blkdev_put(bdev, FMODE_EXCL); 2853 if (locked) { 2854 mutex_unlock(&uuid_mutex); 2855 up_write(&sb->s_umount); 2856 } 2857 return ret; 2858 } 2859 2860 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 2861 struct btrfs_device *device) 2862 { 2863 int ret; 2864 struct btrfs_path *path; 2865 struct btrfs_root *root = device->fs_info->chunk_root; 2866 struct btrfs_dev_item *dev_item; 2867 struct extent_buffer *leaf; 2868 struct btrfs_key key; 2869 2870 path = btrfs_alloc_path(); 2871 if (!path) 2872 return -ENOMEM; 2873 2874 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2875 key.type = BTRFS_DEV_ITEM_KEY; 2876 key.offset = device->devid; 2877 2878 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2879 if (ret < 0) 2880 goto out; 2881 2882 if (ret > 0) { 2883 ret = -ENOENT; 2884 goto out; 2885 } 2886 2887 leaf = path->nodes[0]; 2888 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 2889 2890 btrfs_set_device_id(leaf, dev_item, device->devid); 2891 btrfs_set_device_type(leaf, dev_item, device->type); 2892 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 2893 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 2894 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 2895 btrfs_set_device_total_bytes(leaf, dev_item, 2896 btrfs_device_get_disk_total_bytes(device)); 2897 btrfs_set_device_bytes_used(leaf, dev_item, 2898 btrfs_device_get_bytes_used(device)); 2899 btrfs_mark_buffer_dirty(leaf); 2900 2901 out: 2902 btrfs_free_path(path); 2903 return ret; 2904 } 2905 2906 int btrfs_grow_device(struct btrfs_trans_handle *trans, 2907 struct btrfs_device *device, u64 new_size) 2908 { 2909 struct btrfs_fs_info *fs_info = device->fs_info; 2910 struct btrfs_super_block *super_copy = fs_info->super_copy; 2911 u64 old_total; 2912 u64 diff; 2913 int ret; 2914 2915 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 2916 return -EACCES; 2917 2918 new_size = round_down(new_size, fs_info->sectorsize); 2919 2920 mutex_lock(&fs_info->chunk_mutex); 2921 old_total = btrfs_super_total_bytes(super_copy); 2922 diff = round_down(new_size - device->total_bytes, fs_info->sectorsize); 2923 2924 if (new_size <= device->total_bytes || 2925 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 2926 
mutex_unlock(&fs_info->chunk_mutex); 2927 return -EINVAL; 2928 } 2929 2930 btrfs_set_super_total_bytes(super_copy, 2931 round_down(old_total + diff, fs_info->sectorsize)); 2932 device->fs_devices->total_rw_bytes += diff; 2933 2934 btrfs_device_set_total_bytes(device, new_size); 2935 btrfs_device_set_disk_total_bytes(device, new_size); 2936 btrfs_clear_space_info_full(device->fs_info); 2937 if (list_empty(&device->post_commit_list)) 2938 list_add_tail(&device->post_commit_list, 2939 &trans->transaction->dev_update_list); 2940 mutex_unlock(&fs_info->chunk_mutex); 2941 2942 btrfs_reserve_chunk_metadata(trans, false); 2943 ret = btrfs_update_device(trans, device); 2944 btrfs_trans_release_chunk_metadata(trans); 2945 2946 return ret; 2947 } 2948 2949 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) 2950 { 2951 struct btrfs_fs_info *fs_info = trans->fs_info; 2952 struct btrfs_root *root = fs_info->chunk_root; 2953 int ret; 2954 struct btrfs_path *path; 2955 struct btrfs_key key; 2956 2957 path = btrfs_alloc_path(); 2958 if (!path) 2959 return -ENOMEM; 2960 2961 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2962 key.offset = chunk_offset; 2963 key.type = BTRFS_CHUNK_ITEM_KEY; 2964 2965 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2966 if (ret < 0) 2967 goto out; 2968 else if (ret > 0) { /* Logic error or corruption */ 2969 btrfs_handle_fs_error(fs_info, -ENOENT, 2970 "Failed lookup while freeing chunk."); 2971 ret = -ENOENT; 2972 goto out; 2973 } 2974 2975 ret = btrfs_del_item(trans, root, path); 2976 if (ret < 0) 2977 btrfs_handle_fs_error(fs_info, ret, 2978 "Failed to delete chunk item."); 2979 out: 2980 btrfs_free_path(path); 2981 return ret; 2982 } 2983 2984 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 2985 { 2986 struct btrfs_super_block *super_copy = fs_info->super_copy; 2987 struct btrfs_disk_key *disk_key; 2988 struct btrfs_chunk *chunk; 2989 u8 *ptr; 2990 int ret = 0; 2991 u32 num_stripes; 2992 u32 array_size; 2993 u32 len = 0; 2994 u32 cur; 2995 struct btrfs_key key; 2996 2997 lockdep_assert_held(&fs_info->chunk_mutex); 2998 array_size = btrfs_super_sys_array_size(super_copy); 2999 3000 ptr = super_copy->sys_chunk_array; 3001 cur = 0; 3002 3003 while (cur < array_size) { 3004 disk_key = (struct btrfs_disk_key *)ptr; 3005 btrfs_disk_key_to_cpu(&key, disk_key); 3006 3007 len = sizeof(*disk_key); 3008 3009 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 3010 chunk = (struct btrfs_chunk *)(ptr + len); 3011 num_stripes = btrfs_stack_chunk_num_stripes(chunk); 3012 len += btrfs_chunk_item_size(num_stripes); 3013 } else { 3014 ret = -EIO; 3015 break; 3016 } 3017 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID && 3018 key.offset == chunk_offset) { 3019 memmove(ptr, ptr + len, array_size - (cur + len)); 3020 array_size -= len; 3021 btrfs_set_super_sys_array_size(super_copy, array_size); 3022 } else { 3023 ptr += len; 3024 cur += len; 3025 } 3026 } 3027 return ret; 3028 } 3029 3030 /* 3031 * btrfs_get_chunk_map() - Find the mapping containing the given logical extent. 3032 * @logical: Logical block offset in bytes. 3033 * @length: Length of extent in bytes. 3034 * 3035 * Return: Chunk mapping or ERR_PTR. 
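 *
 * The caller owns a reference on the returned map, for example:
 *
 *	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
 *	if (IS_ERR(em))
 *		return PTR_ERR(em);
 *	map = em->map_lookup;
 *	... use map->num_stripes and map->stripes[i] ...
 *	free_extent_map(em);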
3036 */ 3037 struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, 3038 u64 logical, u64 length) 3039 { 3040 struct extent_map_tree *em_tree; 3041 struct extent_map *em; 3042 3043 em_tree = &fs_info->mapping_tree; 3044 read_lock(&em_tree->lock); 3045 em = lookup_extent_mapping(em_tree, logical, length); 3046 read_unlock(&em_tree->lock); 3047 3048 if (!em) { 3049 btrfs_crit(fs_info, "unable to find logical %llu length %llu", 3050 logical, length); 3051 return ERR_PTR(-EINVAL); 3052 } 3053 3054 if (em->start > logical || em->start + em->len < logical) { 3055 btrfs_crit(fs_info, 3056 "found a bad mapping, wanted %llu-%llu, found %llu-%llu", 3057 logical, length, em->start, em->start + em->len); 3058 free_extent_map(em); 3059 return ERR_PTR(-EINVAL); 3060 } 3061 3062 /* callers are responsible for dropping em's ref. */ 3063 return em; 3064 } 3065 3066 static int remove_chunk_item(struct btrfs_trans_handle *trans, 3067 struct map_lookup *map, u64 chunk_offset) 3068 { 3069 int i; 3070 3071 /* 3072 * Removing chunk items and updating the device items in the chunks btree 3073 * requires holding the chunk_mutex. 3074 * See the comment at btrfs_chunk_alloc() for the details. 3075 */ 3076 lockdep_assert_held(&trans->fs_info->chunk_mutex); 3077 3078 for (i = 0; i < map->num_stripes; i++) { 3079 int ret; 3080 3081 ret = btrfs_update_device(trans, map->stripes[i].dev); 3082 if (ret) 3083 return ret; 3084 } 3085 3086 return btrfs_free_chunk(trans, chunk_offset); 3087 } 3088 3089 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) 3090 { 3091 struct btrfs_fs_info *fs_info = trans->fs_info; 3092 struct extent_map *em; 3093 struct map_lookup *map; 3094 u64 dev_extent_len = 0; 3095 int i, ret = 0; 3096 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 3097 3098 em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); 3099 if (IS_ERR(em)) { 3100 /* 3101 * This is a logic error, but we don't want to just rely on the 3102 * user having built with ASSERT enabled, so if ASSERT doesn't 3103 * do anything we still error out. 3104 */ 3105 ASSERT(0); 3106 return PTR_ERR(em); 3107 } 3108 map = em->map_lookup; 3109 3110 /* 3111 * First delete the device extent items from the devices btree. 3112 * We take the device_list_mutex to avoid racing with the finishing phase 3113 * of a device replace operation. See the comment below before acquiring 3114 * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex 3115 * because that can result in a deadlock when deleting the device extent 3116 * items from the devices btree - COWing an extent buffer from the btree 3117 * may result in allocating a new metadata chunk, which would attempt to 3118 * lock again fs_info->chunk_mutex. 
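 *
 * Note that the two mutexes are not held at the same time here: the
 * device_list_mutex is dropped again before fs_info->chunk_mutex is
 * taken further below, precisely to avoid the deadlock described above.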
3119 */ 3120 mutex_lock(&fs_devices->device_list_mutex); 3121 for (i = 0; i < map->num_stripes; i++) { 3122 struct btrfs_device *device = map->stripes[i].dev; 3123 ret = btrfs_free_dev_extent(trans, device, 3124 map->stripes[i].physical, 3125 &dev_extent_len); 3126 if (ret) { 3127 mutex_unlock(&fs_devices->device_list_mutex); 3128 btrfs_abort_transaction(trans, ret); 3129 goto out; 3130 } 3131 3132 if (device->bytes_used > 0) { 3133 mutex_lock(&fs_info->chunk_mutex); 3134 btrfs_device_set_bytes_used(device, 3135 device->bytes_used - dev_extent_len); 3136 atomic64_add(dev_extent_len, &fs_info->free_chunk_space); 3137 btrfs_clear_space_info_full(fs_info); 3138 mutex_unlock(&fs_info->chunk_mutex); 3139 } 3140 } 3141 mutex_unlock(&fs_devices->device_list_mutex); 3142 3143 /* 3144 * We acquire fs_info->chunk_mutex for 2 reasons: 3145 * 3146 * 1) Just like with the first phase of the chunk allocation, we must 3147 * reserve system space, do all chunk btree updates and deletions, and 3148 * update the system chunk array in the superblock while holding this 3149 * mutex. This is for similar reasons as explained on the comment at 3150 * the top of btrfs_chunk_alloc(); 3151 * 3152 * 2) Prevent races with the final phase of a device replace operation 3153 * that replaces the device object associated with the map's stripes, 3154 * because the device object's id can change at any time during that 3155 * final phase of the device replace operation 3156 * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the 3157 * replaced device and then see it with an ID of 3158 * BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating 3159 * the device item, which does not exist on the chunk btree. 3160 * The finishing phase of device replace acquires both the 3161 * device_list_mutex and the chunk_mutex, in that order, so we are 3162 * safe by just acquiring the chunk_mutex. 3163 */ 3164 trans->removing_chunk = true; 3165 mutex_lock(&fs_info->chunk_mutex); 3166 3167 check_system_chunk(trans, map->type); 3168 3169 ret = remove_chunk_item(trans, map, chunk_offset); 3170 /* 3171 * Normally we should not get -ENOSPC since we reserved space before 3172 * through the call to check_system_chunk(). 3173 * 3174 * Despite our system space_info having enough free space, we may not 3175 * be able to allocate extents from its block groups, because all have 3176 * an incompatible profile, which will force us to allocate a new system 3177 * block group with the right profile, or right after we called 3178 * check_system_chunk() above, a scrub turned the only system block group 3179 * with enough free space into RO mode. 3180 * This is explained with more detail at do_chunk_alloc(). 3181 * 3182 * So if we get -ENOSPC, allocate a new system chunk and retry once.
3183 */ 3184 if (ret == -ENOSPC) { 3185 const u64 sys_flags = btrfs_system_alloc_profile(fs_info); 3186 struct btrfs_block_group *sys_bg; 3187 3188 sys_bg = btrfs_create_chunk(trans, sys_flags); 3189 if (IS_ERR(sys_bg)) { 3190 ret = PTR_ERR(sys_bg); 3191 btrfs_abort_transaction(trans, ret); 3192 goto out; 3193 } 3194 3195 ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg); 3196 if (ret) { 3197 btrfs_abort_transaction(trans, ret); 3198 goto out; 3199 } 3200 3201 ret = remove_chunk_item(trans, map, chunk_offset); 3202 if (ret) { 3203 btrfs_abort_transaction(trans, ret); 3204 goto out; 3205 } 3206 } else if (ret) { 3207 btrfs_abort_transaction(trans, ret); 3208 goto out; 3209 } 3210 3211 trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len); 3212 3213 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 3214 ret = btrfs_del_sys_chunk(fs_info, chunk_offset); 3215 if (ret) { 3216 btrfs_abort_transaction(trans, ret); 3217 goto out; 3218 } 3219 } 3220 3221 mutex_unlock(&fs_info->chunk_mutex); 3222 trans->removing_chunk = false; 3223 3224 /* 3225 * We are done with chunk btree updates and deletions, so release the 3226 * system space we previously reserved (with check_system_chunk()). 3227 */ 3228 btrfs_trans_release_chunk_metadata(trans); 3229 3230 ret = btrfs_remove_block_group(trans, chunk_offset, em); 3231 if (ret) { 3232 btrfs_abort_transaction(trans, ret); 3233 goto out; 3234 } 3235 3236 out: 3237 if (trans->removing_chunk) { 3238 mutex_unlock(&fs_info->chunk_mutex); 3239 trans->removing_chunk = false; 3240 } 3241 /* once for us */ 3242 free_extent_map(em); 3243 return ret; 3244 } 3245 3246 int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 3247 { 3248 struct btrfs_root *root = fs_info->chunk_root; 3249 struct btrfs_trans_handle *trans; 3250 struct btrfs_block_group *block_group; 3251 u64 length; 3252 int ret; 3253 3254 /* 3255 * Prevent races with automatic removal of unused block groups. 3256 * After we relocate and before we remove the chunk with offset 3257 * chunk_offset, automatic removal of the block group can kick in, 3258 * resulting in a failure when calling btrfs_remove_chunk() below. 3259 * 3260 * Make sure to acquire this mutex before doing a tree search (dev 3261 * or chunk trees) to find chunks. Otherwise the cleaner kthread might 3262 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after 3263 * we release the path used to search the chunk/dev tree and before 3264 * the current task acquires this mutex and calls us. 3265 */ 3266 lockdep_assert_held(&fs_info->reclaim_bgs_lock); 3267 3268 /* step one, relocate all the extents inside this chunk */ 3269 btrfs_scrub_pause(fs_info); 3270 ret = btrfs_relocate_block_group(fs_info, chunk_offset); 3271 btrfs_scrub_continue(fs_info); 3272 if (ret) 3273 return ret; 3274 3275 block_group = btrfs_lookup_block_group(fs_info, chunk_offset); 3276 if (!block_group) 3277 return -ENOENT; 3278 btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); 3279 length = block_group->length; 3280 btrfs_put_block_group(block_group); 3281 3282 /* 3283 * On a zoned file system, discard the whole block group, this will 3284 * trigger a REQ_OP_ZONE_RESET operation on the device zone. If 3285 * resetting the zone fails, don't treat it as a fatal problem from the 3286 * filesystem's point of view. 
3287 */ 3288 if (btrfs_is_zoned(fs_info)) { 3289 ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL); 3290 if (ret) 3291 btrfs_info(fs_info, 3292 "failed to reset zone %llu after relocation", 3293 chunk_offset); 3294 } 3295 3296 trans = btrfs_start_trans_remove_block_group(root->fs_info, 3297 chunk_offset); 3298 if (IS_ERR(trans)) { 3299 ret = PTR_ERR(trans); 3300 btrfs_handle_fs_error(root->fs_info, ret, NULL); 3301 return ret; 3302 } 3303 3304 /* 3305 * step two, delete the device extents and the 3306 * chunk tree entries 3307 */ 3308 ret = btrfs_remove_chunk(trans, chunk_offset); 3309 btrfs_end_transaction(trans); 3310 return ret; 3311 } 3312 3313 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info) 3314 { 3315 struct btrfs_root *chunk_root = fs_info->chunk_root; 3316 struct btrfs_path *path; 3317 struct extent_buffer *leaf; 3318 struct btrfs_chunk *chunk; 3319 struct btrfs_key key; 3320 struct btrfs_key found_key; 3321 u64 chunk_type; 3322 bool retried = false; 3323 int failed = 0; 3324 int ret; 3325 3326 path = btrfs_alloc_path(); 3327 if (!path) 3328 return -ENOMEM; 3329 3330 again: 3331 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 3332 key.offset = (u64)-1; 3333 key.type = BTRFS_CHUNK_ITEM_KEY; 3334 3335 while (1) { 3336 mutex_lock(&fs_info->reclaim_bgs_lock); 3337 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 3338 if (ret < 0) { 3339 mutex_unlock(&fs_info->reclaim_bgs_lock); 3340 goto error; 3341 } 3342 BUG_ON(ret == 0); /* Corruption */ 3343 3344 ret = btrfs_previous_item(chunk_root, path, key.objectid, 3345 key.type); 3346 if (ret) 3347 mutex_unlock(&fs_info->reclaim_bgs_lock); 3348 if (ret < 0) 3349 goto error; 3350 if (ret > 0) 3351 break; 3352 3353 leaf = path->nodes[0]; 3354 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3355 3356 chunk = btrfs_item_ptr(leaf, path->slots[0], 3357 struct btrfs_chunk); 3358 chunk_type = btrfs_chunk_type(leaf, chunk); 3359 btrfs_release_path(path); 3360 3361 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 3362 ret = btrfs_relocate_chunk(fs_info, found_key.offset); 3363 if (ret == -ENOSPC) 3364 failed++; 3365 else 3366 BUG_ON(ret); 3367 } 3368 mutex_unlock(&fs_info->reclaim_bgs_lock); 3369 3370 if (found_key.offset == 0) 3371 break; 3372 key.offset = found_key.offset - 1; 3373 } 3374 ret = 0; 3375 if (failed && !retried) { 3376 failed = 0; 3377 retried = true; 3378 goto again; 3379 } else if (WARN_ON(failed && retried)) { 3380 ret = -ENOSPC; 3381 } 3382 error: 3383 btrfs_free_path(path); 3384 return ret; 3385 } 3386 3387 /* 3388 * return 1 : allocate a data chunk successfully, 3389 * return <0: errors during allocating a data chunk, 3390 * return 0 : no need to allocate a data chunk. 
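 *
 * The balance loop in __btrfs_balance() consumes these values roughly as:
 *
 *	ret = btrfs_may_alloc_data_chunk(fs_info, found_key.offset);
 *	if (ret < 0)
 *		goto error;
 *	else if (ret == 1)
 *		chunk_reserved = 1;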
3391 */ 3392 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, 3393 u64 chunk_offset) 3394 { 3395 struct btrfs_block_group *cache; 3396 u64 bytes_used; 3397 u64 chunk_type; 3398 3399 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3400 ASSERT(cache); 3401 chunk_type = cache->flags; 3402 btrfs_put_block_group(cache); 3403 3404 if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA)) 3405 return 0; 3406 3407 spin_lock(&fs_info->data_sinfo->lock); 3408 bytes_used = fs_info->data_sinfo->bytes_used; 3409 spin_unlock(&fs_info->data_sinfo->lock); 3410 3411 if (!bytes_used) { 3412 struct btrfs_trans_handle *trans; 3413 int ret; 3414 3415 trans = btrfs_join_transaction(fs_info->tree_root); 3416 if (IS_ERR(trans)) 3417 return PTR_ERR(trans); 3418 3419 ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA); 3420 btrfs_end_transaction(trans); 3421 if (ret < 0) 3422 return ret; 3423 return 1; 3424 } 3425 3426 return 0; 3427 } 3428 3429 static int insert_balance_item(struct btrfs_fs_info *fs_info, 3430 struct btrfs_balance_control *bctl) 3431 { 3432 struct btrfs_root *root = fs_info->tree_root; 3433 struct btrfs_trans_handle *trans; 3434 struct btrfs_balance_item *item; 3435 struct btrfs_disk_balance_args disk_bargs; 3436 struct btrfs_path *path; 3437 struct extent_buffer *leaf; 3438 struct btrfs_key key; 3439 int ret, err; 3440 3441 path = btrfs_alloc_path(); 3442 if (!path) 3443 return -ENOMEM; 3444 3445 trans = btrfs_start_transaction(root, 0); 3446 if (IS_ERR(trans)) { 3447 btrfs_free_path(path); 3448 return PTR_ERR(trans); 3449 } 3450 3451 key.objectid = BTRFS_BALANCE_OBJECTID; 3452 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3453 key.offset = 0; 3454 3455 ret = btrfs_insert_empty_item(trans, root, path, &key, 3456 sizeof(*item)); 3457 if (ret) 3458 goto out; 3459 3460 leaf = path->nodes[0]; 3461 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 3462 3463 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 3464 3465 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); 3466 btrfs_set_balance_data(leaf, item, &disk_bargs); 3467 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); 3468 btrfs_set_balance_meta(leaf, item, &disk_bargs); 3469 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); 3470 btrfs_set_balance_sys(leaf, item, &disk_bargs); 3471 3472 btrfs_set_balance_flags(leaf, item, bctl->flags); 3473 3474 btrfs_mark_buffer_dirty(leaf); 3475 out: 3476 btrfs_free_path(path); 3477 err = btrfs_commit_transaction(trans); 3478 if (err && !ret) 3479 ret = err; 3480 return ret; 3481 } 3482 3483 static int del_balance_item(struct btrfs_fs_info *fs_info) 3484 { 3485 struct btrfs_root *root = fs_info->tree_root; 3486 struct btrfs_trans_handle *trans; 3487 struct btrfs_path *path; 3488 struct btrfs_key key; 3489 int ret, err; 3490 3491 path = btrfs_alloc_path(); 3492 if (!path) 3493 return -ENOMEM; 3494 3495 trans = btrfs_start_transaction_fallback_global_rsv(root, 0); 3496 if (IS_ERR(trans)) { 3497 btrfs_free_path(path); 3498 return PTR_ERR(trans); 3499 } 3500 3501 key.objectid = BTRFS_BALANCE_OBJECTID; 3502 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3503 key.offset = 0; 3504 3505 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3506 if (ret < 0) 3507 goto out; 3508 if (ret > 0) { 3509 ret = -ENOENT; 3510 goto out; 3511 } 3512 3513 ret = btrfs_del_item(trans, root, path); 3514 out: 3515 btrfs_free_path(path); 3516 err = btrfs_commit_transaction(trans); 3517 if (err && !ret) 3518 ret = err; 3519 return ret; 3520 } 3521 3522 /* 3523 * This is a 
heuristic used to reduce the number of chunks balanced on 3524 * resume after balance was interrupted. 3525 */ 3526 static void update_balance_args(struct btrfs_balance_control *bctl) 3527 { 3528 /* 3529 * Turn on soft mode for chunk types that were being converted. 3530 */ 3531 if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) 3532 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; 3533 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) 3534 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; 3535 if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) 3536 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; 3537 3538 /* 3539 * Turn on the usage filter if it is not already used. The idea is 3540 * that chunks that we have already balanced should be 3541 * reasonably full. Don't do it for chunks that are being 3542 * converted - that will keep us from relocating unconverted 3543 * (albeit full) chunks. 3544 */ 3545 if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && 3546 !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3547 !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3548 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; 3549 bctl->data.usage = 90; 3550 } 3551 if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && 3552 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3553 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3554 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; 3555 bctl->sys.usage = 90; 3556 } 3557 if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && 3558 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3559 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3560 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; 3561 bctl->meta.usage = 90; 3562 } 3563 } 3564 3565 /* 3566 * Clear the balance status in fs_info and delete the balance item from disk. 3567 */ 3568 static void reset_balance_state(struct btrfs_fs_info *fs_info) 3569 { 3570 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3571 int ret; 3572 3573 BUG_ON(!fs_info->balance_ctl); 3574 3575 spin_lock(&fs_info->balance_lock); 3576 fs_info->balance_ctl = NULL; 3577 spin_unlock(&fs_info->balance_lock); 3578 3579 kfree(bctl); 3580 ret = del_balance_item(fs_info); 3581 if (ret) 3582 btrfs_handle_fs_error(fs_info, ret, NULL); 3583 } 3584 3585 /* 3586 * Balance filters. Return 1 if the chunk should be filtered out 3587 * (should not be balanced).
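 *
 * should_balance_chunk() combines them: a chunk is relocated only if none
 * of the filters enabled in the matching btrfs_balance_args filters it out.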
3588 */ 3589 static int chunk_profiles_filter(u64 chunk_type, 3590 struct btrfs_balance_args *bargs) 3591 { 3592 chunk_type = chunk_to_extended(chunk_type) & 3593 BTRFS_EXTENDED_PROFILE_MASK; 3594 3595 if (bargs->profiles & chunk_type) 3596 return 0; 3597 3598 return 1; 3599 } 3600 3601 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 3602 struct btrfs_balance_args *bargs) 3603 { 3604 struct btrfs_block_group *cache; 3605 u64 chunk_used; 3606 u64 user_thresh_min; 3607 u64 user_thresh_max; 3608 int ret = 1; 3609 3610 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3611 chunk_used = cache->used; 3612 3613 if (bargs->usage_min == 0) 3614 user_thresh_min = 0; 3615 else 3616 user_thresh_min = div_factor_fine(cache->length, 3617 bargs->usage_min); 3618 3619 if (bargs->usage_max == 0) 3620 user_thresh_max = 1; 3621 else if (bargs->usage_max > 100) 3622 user_thresh_max = cache->length; 3623 else 3624 user_thresh_max = div_factor_fine(cache->length, 3625 bargs->usage_max); 3626 3627 if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) 3628 ret = 0; 3629 3630 btrfs_put_block_group(cache); 3631 return ret; 3632 } 3633 3634 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, 3635 u64 chunk_offset, struct btrfs_balance_args *bargs) 3636 { 3637 struct btrfs_block_group *cache; 3638 u64 chunk_used, user_thresh; 3639 int ret = 1; 3640 3641 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3642 chunk_used = cache->used; 3643 3644 if (bargs->usage_min == 0) 3645 user_thresh = 1; 3646 else if (bargs->usage > 100) 3647 user_thresh = cache->length; 3648 else 3649 user_thresh = div_factor_fine(cache->length, bargs->usage); 3650 3651 if (chunk_used < user_thresh) 3652 ret = 0; 3653 3654 btrfs_put_block_group(cache); 3655 return ret; 3656 } 3657 3658 static int chunk_devid_filter(struct extent_buffer *leaf, 3659 struct btrfs_chunk *chunk, 3660 struct btrfs_balance_args *bargs) 3661 { 3662 struct btrfs_stripe *stripe; 3663 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3664 int i; 3665 3666 for (i = 0; i < num_stripes; i++) { 3667 stripe = btrfs_stripe_nr(chunk, i); 3668 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) 3669 return 0; 3670 } 3671 3672 return 1; 3673 } 3674 3675 static u64 calc_data_stripes(u64 type, int num_stripes) 3676 { 3677 const int index = btrfs_bg_flags_to_raid_index(type); 3678 const int ncopies = btrfs_raid_array[index].ncopies; 3679 const int nparity = btrfs_raid_array[index].nparity; 3680 3681 return (num_stripes - nparity) / ncopies; 3682 } 3683 3684 /* [pstart, pend) */ 3685 static int chunk_drange_filter(struct extent_buffer *leaf, 3686 struct btrfs_chunk *chunk, 3687 struct btrfs_balance_args *bargs) 3688 { 3689 struct btrfs_stripe *stripe; 3690 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3691 u64 stripe_offset; 3692 u64 stripe_length; 3693 u64 type; 3694 int factor; 3695 int i; 3696 3697 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) 3698 return 0; 3699 3700 type = btrfs_chunk_type(leaf, chunk); 3701 factor = calc_data_stripes(type, num_stripes); 3702 3703 for (i = 0; i < num_stripes; i++) { 3704 stripe = btrfs_stripe_nr(chunk, i); 3705 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) 3706 continue; 3707 3708 stripe_offset = btrfs_stripe_offset(leaf, stripe); 3709 stripe_length = btrfs_chunk_length(leaf, chunk); 3710 stripe_length = div_u64(stripe_length, factor); 3711 3712 if (stripe_offset < bargs->pend && 3713 stripe_offset + stripe_length > bargs->pstart) 3714 return 0; 
3715 } 3716 3717 return 1; 3718 } 3719 3720 /* [vstart, vend) */ 3721 static int chunk_vrange_filter(struct extent_buffer *leaf, 3722 struct btrfs_chunk *chunk, 3723 u64 chunk_offset, 3724 struct btrfs_balance_args *bargs) 3725 { 3726 if (chunk_offset < bargs->vend && 3727 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) 3728 /* at least part of the chunk is inside this vrange */ 3729 return 0; 3730 3731 return 1; 3732 } 3733 3734 static int chunk_stripes_range_filter(struct extent_buffer *leaf, 3735 struct btrfs_chunk *chunk, 3736 struct btrfs_balance_args *bargs) 3737 { 3738 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3739 3740 if (bargs->stripes_min <= num_stripes 3741 && num_stripes <= bargs->stripes_max) 3742 return 0; 3743 3744 return 1; 3745 } 3746 3747 static int chunk_soft_convert_filter(u64 chunk_type, 3748 struct btrfs_balance_args *bargs) 3749 { 3750 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 3751 return 0; 3752 3753 chunk_type = chunk_to_extended(chunk_type) & 3754 BTRFS_EXTENDED_PROFILE_MASK; 3755 3756 if (bargs->target == chunk_type) 3757 return 1; 3758 3759 return 0; 3760 } 3761 3762 static int should_balance_chunk(struct extent_buffer *leaf, 3763 struct btrfs_chunk *chunk, u64 chunk_offset) 3764 { 3765 struct btrfs_fs_info *fs_info = leaf->fs_info; 3766 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3767 struct btrfs_balance_args *bargs = NULL; 3768 u64 chunk_type = btrfs_chunk_type(leaf, chunk); 3769 3770 /* type filter */ 3771 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & 3772 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { 3773 return 0; 3774 } 3775 3776 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3777 bargs = &bctl->data; 3778 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3779 bargs = &bctl->sys; 3780 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3781 bargs = &bctl->meta; 3782 3783 /* profiles filter */ 3784 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && 3785 chunk_profiles_filter(chunk_type, bargs)) { 3786 return 0; 3787 } 3788 3789 /* usage filter */ 3790 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && 3791 chunk_usage_filter(fs_info, chunk_offset, bargs)) { 3792 return 0; 3793 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3794 chunk_usage_range_filter(fs_info, chunk_offset, bargs)) { 3795 return 0; 3796 } 3797 3798 /* devid filter */ 3799 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && 3800 chunk_devid_filter(leaf, chunk, bargs)) { 3801 return 0; 3802 } 3803 3804 /* drange filter, makes sense only with devid filter */ 3805 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && 3806 chunk_drange_filter(leaf, chunk, bargs)) { 3807 return 0; 3808 } 3809 3810 /* vrange filter */ 3811 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && 3812 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { 3813 return 0; 3814 } 3815 3816 /* stripes filter */ 3817 if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) && 3818 chunk_stripes_range_filter(leaf, chunk, bargs)) { 3819 return 0; 3820 } 3821 3822 /* soft profile changing mode */ 3823 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && 3824 chunk_soft_convert_filter(chunk_type, bargs)) { 3825 return 0; 3826 } 3827 3828 /* 3829 * limited by count, must be the last filter 3830 */ 3831 if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { 3832 if (bargs->limit == 0) 3833 return 0; 3834 else 3835 bargs->limit--; 3836 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { 3837 /* 3838 * Same logic as the 'limit' filter; the minimum cannot be 3839 * determined here 
because we do not have the global information
		 * about the count of all chunks that satisfy the filters.
		 */
		if (bargs->limit_max == 0)
			return 0;
		else
			bargs->limit_max--;
	}

	return 1;
}

static int __btrfs_balance(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	u64 chunk_type;
	struct btrfs_chunk *chunk;
	struct btrfs_path *path = NULL;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	int slot;
	int ret;
	int enospc_errors = 0;
	bool counting = true;
	/*
	 * The single value limit and min/max limits use the same bytes in the
	 * balance args, so save the single value limits here and restore them
	 * after the counting pass.
	 */
	u64 limit_data = bctl->data.limit;
	u64 limit_meta = bctl->meta.limit;
	u64 limit_sys = bctl->sys.limit;
	u32 count_data = 0;
	u32 count_meta = 0;
	u32 count_sys = 0;
	int chunk_reserved = 0;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto error;
	}

	/* zero out stat counters */
	spin_lock(&fs_info->balance_lock);
	memset(&bctl->stat, 0, sizeof(bctl->stat));
	spin_unlock(&fs_info->balance_lock);
again:
	if (!counting) {
		/*
		 * The single value limit and min/max limits use the same bytes
		 * in the balance args, restore the single value limits that
		 * were saved before the counting pass.
		 */
		bctl->data.limit = limit_data;
		bctl->meta.limit = limit_meta;
		bctl->sys.limit = limit_sys;
	}
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	while (1) {
		if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
		    atomic_read(&fs_info->balance_cancel_req)) {
			ret = -ECANCELED;
			goto error;
		}

		mutex_lock(&fs_info->reclaim_bgs_lock);
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto error;
		}

		/*
		 * this shouldn't happen, it means the last relocate
		 * failed
		 */
		if (ret == 0)
			BUG(); /* FIXME break ?
*/ 3918 3919 ret = btrfs_previous_item(chunk_root, path, 0, 3920 BTRFS_CHUNK_ITEM_KEY); 3921 if (ret) { 3922 mutex_unlock(&fs_info->reclaim_bgs_lock); 3923 ret = 0; 3924 break; 3925 } 3926 3927 leaf = path->nodes[0]; 3928 slot = path->slots[0]; 3929 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3930 3931 if (found_key.objectid != key.objectid) { 3932 mutex_unlock(&fs_info->reclaim_bgs_lock); 3933 break; 3934 } 3935 3936 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 3937 chunk_type = btrfs_chunk_type(leaf, chunk); 3938 3939 if (!counting) { 3940 spin_lock(&fs_info->balance_lock); 3941 bctl->stat.considered++; 3942 spin_unlock(&fs_info->balance_lock); 3943 } 3944 3945 ret = should_balance_chunk(leaf, chunk, found_key.offset); 3946 3947 btrfs_release_path(path); 3948 if (!ret) { 3949 mutex_unlock(&fs_info->reclaim_bgs_lock); 3950 goto loop; 3951 } 3952 3953 if (counting) { 3954 mutex_unlock(&fs_info->reclaim_bgs_lock); 3955 spin_lock(&fs_info->balance_lock); 3956 bctl->stat.expected++; 3957 spin_unlock(&fs_info->balance_lock); 3958 3959 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3960 count_data++; 3961 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3962 count_sys++; 3963 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3964 count_meta++; 3965 3966 goto loop; 3967 } 3968 3969 /* 3970 * Apply limit_min filter, no need to check if the LIMITS 3971 * filter is used, limit_min is 0 by default 3972 */ 3973 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) && 3974 count_data < bctl->data.limit_min) 3975 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) && 3976 count_meta < bctl->meta.limit_min) 3977 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) && 3978 count_sys < bctl->sys.limit_min)) { 3979 mutex_unlock(&fs_info->reclaim_bgs_lock); 3980 goto loop; 3981 } 3982 3983 if (!chunk_reserved) { 3984 /* 3985 * We may be relocating the only data chunk we have, 3986 * which could potentially end up with losing data's 3987 * raid profile, so lets allocate an empty one in 3988 * advance. 3989 */ 3990 ret = btrfs_may_alloc_data_chunk(fs_info, 3991 found_key.offset); 3992 if (ret < 0) { 3993 mutex_unlock(&fs_info->reclaim_bgs_lock); 3994 goto error; 3995 } else if (ret == 1) { 3996 chunk_reserved = 1; 3997 } 3998 } 3999 4000 ret = btrfs_relocate_chunk(fs_info, found_key.offset); 4001 mutex_unlock(&fs_info->reclaim_bgs_lock); 4002 if (ret == -ENOSPC) { 4003 enospc_errors++; 4004 } else if (ret == -ETXTBSY) { 4005 btrfs_info(fs_info, 4006 "skipping relocation of block group %llu due to active swapfile", 4007 found_key.offset); 4008 ret = 0; 4009 } else if (ret) { 4010 goto error; 4011 } else { 4012 spin_lock(&fs_info->balance_lock); 4013 bctl->stat.completed++; 4014 spin_unlock(&fs_info->balance_lock); 4015 } 4016 loop: 4017 if (found_key.offset == 0) 4018 break; 4019 key.offset = found_key.offset - 1; 4020 } 4021 4022 if (counting) { 4023 btrfs_release_path(path); 4024 counting = false; 4025 goto again; 4026 } 4027 error: 4028 btrfs_free_path(path); 4029 if (enospc_errors) { 4030 btrfs_info(fs_info, "%d enospc errors during balance", 4031 enospc_errors); 4032 if (!ret) 4033 ret = -ENOSPC; 4034 } 4035 4036 return ret; 4037 } 4038 4039 /** 4040 * alloc_profile_is_valid - see if a given profile is valid and reduced 4041 * @flags: profile to validate 4042 * @extended: if true @flags is treated as an extended profile 4043 */ 4044 static int alloc_profile_is_valid(u64 flags, int extended) 4045 { 4046 u64 mask = (extended ? 
BTRFS_EXTENDED_PROFILE_MASK : 4047 BTRFS_BLOCK_GROUP_PROFILE_MASK); 4048 4049 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; 4050 4051 /* 1) check that all other bits are zeroed */ 4052 if (flags & ~mask) 4053 return 0; 4054 4055 /* 2) see if profile is reduced */ 4056 if (flags == 0) 4057 return !extended; /* "0" is valid for usual profiles */ 4058 4059 return has_single_bit_set(flags); 4060 } 4061 4062 static inline int balance_need_close(struct btrfs_fs_info *fs_info) 4063 { 4064 /* cancel requested || normal exit path */ 4065 return atomic_read(&fs_info->balance_cancel_req) || 4066 (atomic_read(&fs_info->balance_pause_req) == 0 && 4067 atomic_read(&fs_info->balance_cancel_req) == 0); 4068 } 4069 4070 /* 4071 * Validate target profile against allowed profiles and return true if it's OK. 4072 * Otherwise print the error message and return false. 4073 */ 4074 static inline int validate_convert_profile(struct btrfs_fs_info *fs_info, 4075 const struct btrfs_balance_args *bargs, 4076 u64 allowed, const char *type) 4077 { 4078 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 4079 return true; 4080 4081 if (fs_info->sectorsize < PAGE_SIZE && 4082 bargs->target & BTRFS_BLOCK_GROUP_RAID56_MASK) { 4083 btrfs_err(fs_info, 4084 "RAID56 is not yet supported for sectorsize %u with page size %lu", 4085 fs_info->sectorsize, PAGE_SIZE); 4086 return false; 4087 } 4088 /* Profile is valid and does not have bits outside of the allowed set */ 4089 if (alloc_profile_is_valid(bargs->target, 1) && 4090 (bargs->target & ~allowed) == 0) 4091 return true; 4092 4093 btrfs_err(fs_info, "balance: invalid convert %s profile %s", 4094 type, btrfs_bg_type_to_raid_name(bargs->target)); 4095 return false; 4096 } 4097 4098 /* 4099 * Fill @buf with textual description of balance filter flags @bargs, up to 4100 * @size_buf including the terminating null. The output may be trimmed if it 4101 * does not fit into the provided buffer. 
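 *
 * As a rough illustration (the filter values here are made up), a set of
 * balance args with the CONVERT, SOFT and USAGE flags set would be rendered
 * by the code below as something like:
 *
 *   convert=raid1,soft,usage=55
 *
 * with the trailing comma of the last filter stripped at out_overflow.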
4102 */ 4103 static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf, 4104 u32 size_buf) 4105 { 4106 int ret; 4107 u32 size_bp = size_buf; 4108 char *bp = buf; 4109 u64 flags = bargs->flags; 4110 char tmp_buf[128] = {'\0'}; 4111 4112 if (!flags) 4113 return; 4114 4115 #define CHECK_APPEND_NOARG(a) \ 4116 do { \ 4117 ret = snprintf(bp, size_bp, (a)); \ 4118 if (ret < 0 || ret >= size_bp) \ 4119 goto out_overflow; \ 4120 size_bp -= ret; \ 4121 bp += ret; \ 4122 } while (0) 4123 4124 #define CHECK_APPEND_1ARG(a, v1) \ 4125 do { \ 4126 ret = snprintf(bp, size_bp, (a), (v1)); \ 4127 if (ret < 0 || ret >= size_bp) \ 4128 goto out_overflow; \ 4129 size_bp -= ret; \ 4130 bp += ret; \ 4131 } while (0) 4132 4133 #define CHECK_APPEND_2ARG(a, v1, v2) \ 4134 do { \ 4135 ret = snprintf(bp, size_bp, (a), (v1), (v2)); \ 4136 if (ret < 0 || ret >= size_bp) \ 4137 goto out_overflow; \ 4138 size_bp -= ret; \ 4139 bp += ret; \ 4140 } while (0) 4141 4142 if (flags & BTRFS_BALANCE_ARGS_CONVERT) 4143 CHECK_APPEND_1ARG("convert=%s,", 4144 btrfs_bg_type_to_raid_name(bargs->target)); 4145 4146 if (flags & BTRFS_BALANCE_ARGS_SOFT) 4147 CHECK_APPEND_NOARG("soft,"); 4148 4149 if (flags & BTRFS_BALANCE_ARGS_PROFILES) { 4150 btrfs_describe_block_groups(bargs->profiles, tmp_buf, 4151 sizeof(tmp_buf)); 4152 CHECK_APPEND_1ARG("profiles=%s,", tmp_buf); 4153 } 4154 4155 if (flags & BTRFS_BALANCE_ARGS_USAGE) 4156 CHECK_APPEND_1ARG("usage=%llu,", bargs->usage); 4157 4158 if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) 4159 CHECK_APPEND_2ARG("usage=%u..%u,", 4160 bargs->usage_min, bargs->usage_max); 4161 4162 if (flags & BTRFS_BALANCE_ARGS_DEVID) 4163 CHECK_APPEND_1ARG("devid=%llu,", bargs->devid); 4164 4165 if (flags & BTRFS_BALANCE_ARGS_DRANGE) 4166 CHECK_APPEND_2ARG("drange=%llu..%llu,", 4167 bargs->pstart, bargs->pend); 4168 4169 if (flags & BTRFS_BALANCE_ARGS_VRANGE) 4170 CHECK_APPEND_2ARG("vrange=%llu..%llu,", 4171 bargs->vstart, bargs->vend); 4172 4173 if (flags & BTRFS_BALANCE_ARGS_LIMIT) 4174 CHECK_APPEND_1ARG("limit=%llu,", bargs->limit); 4175 4176 if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE) 4177 CHECK_APPEND_2ARG("limit=%u..%u,", 4178 bargs->limit_min, bargs->limit_max); 4179 4180 if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) 4181 CHECK_APPEND_2ARG("stripes=%u..%u,", 4182 bargs->stripes_min, bargs->stripes_max); 4183 4184 #undef CHECK_APPEND_2ARG 4185 #undef CHECK_APPEND_1ARG 4186 #undef CHECK_APPEND_NOARG 4187 4188 out_overflow: 4189 4190 if (size_bp < size_buf) 4191 buf[size_buf - size_bp - 1] = '\0'; /* remove last , */ 4192 else 4193 buf[0] = '\0'; 4194 } 4195 4196 static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info) 4197 { 4198 u32 size_buf = 1024; 4199 char tmp_buf[192] = {'\0'}; 4200 char *buf; 4201 char *bp; 4202 u32 size_bp = size_buf; 4203 int ret; 4204 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 4205 4206 buf = kzalloc(size_buf, GFP_KERNEL); 4207 if (!buf) 4208 return; 4209 4210 bp = buf; 4211 4212 #define CHECK_APPEND_1ARG(a, v1) \ 4213 do { \ 4214 ret = snprintf(bp, size_bp, (a), (v1)); \ 4215 if (ret < 0 || ret >= size_bp) \ 4216 goto out_overflow; \ 4217 size_bp -= ret; \ 4218 bp += ret; \ 4219 } while (0) 4220 4221 if (bctl->flags & BTRFS_BALANCE_FORCE) 4222 CHECK_APPEND_1ARG("%s", "-f "); 4223 4224 if (bctl->flags & BTRFS_BALANCE_DATA) { 4225 describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf)); 4226 CHECK_APPEND_1ARG("-d%s ", tmp_buf); 4227 } 4228 4229 if (bctl->flags & BTRFS_BALANCE_METADATA) { 4230 describe_balance_args(&bctl->meta, 
tmp_buf, sizeof(tmp_buf));
		CHECK_APPEND_1ARG("-m%s ", tmp_buf);
	}

	if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
		describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
		CHECK_APPEND_1ARG("-s%s ", tmp_buf);
	}

#undef CHECK_APPEND_1ARG

out_overflow:

	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
	btrfs_info(fs_info, "balance: %s %s",
		   (bctl->flags & BTRFS_BALANCE_RESUME) ?
		   "resume" : "start", buf);

	kfree(buf);
}

/*
 * Should be called with the balance mutex held
 */
int btrfs_balance(struct btrfs_fs_info *fs_info,
		  struct btrfs_balance_control *bctl,
		  struct btrfs_ioctl_balance_args *bargs)
{
	u64 meta_target, data_target;
	u64 allowed;
	int mixed = 0;
	int ret;
	u64 num_devices;
	unsigned seq;
	bool reducing_redundancy;
	int i;

	if (btrfs_fs_closing(fs_info) ||
	    atomic_read(&fs_info->balance_pause_req) ||
	    btrfs_should_cancel_balance(fs_info)) {
		ret = -EINVAL;
		goto out;
	}

	allowed = btrfs_super_incompat_flags(fs_info->super_copy);
	if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
		mixed = 1;

	/*
	 * In case of mixed groups both data and meta should be picked,
	 * and identical options should be given for both of them.
	 */
	allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
	if (mixed && (bctl->flags & allowed)) {
		if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
		    !(bctl->flags & BTRFS_BALANCE_METADATA) ||
		    memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
			btrfs_err(fs_info,
	"balance: mixed groups data and metadata options must be the same");
			ret = -EINVAL;
			goto out;
		}
	}

	/*
	 * rw_devices will not change at the moment, device add/delete/replace
	 * are exclusive
	 */
	num_devices = fs_info->fs_devices->rw_devices;

	/*
	 * SINGLE profile on-disk has no profile bit, but in-memory we have a
	 * special bit for it, to make it easier to distinguish. Thus we need
	 * to set it manually, or balance would refuse the profile.
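 *
 * For example (using the btrfs_raid_array values defined earlier in this
 * file), with num_devices == 2 the loop below keeps RAID0 and RAID1 in the
 * allowed mask but filters out RAID1C3 and RAID1C4, whose devs_min are 3
 * and 4.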
4305 */ 4306 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 4307 for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) 4308 if (num_devices >= btrfs_raid_array[i].devs_min) 4309 allowed |= btrfs_raid_array[i].bg_flag; 4310 4311 if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") || 4312 !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") || 4313 !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) { 4314 ret = -EINVAL; 4315 goto out; 4316 } 4317 4318 /* 4319 * Allow to reduce metadata or system integrity only if force set for 4320 * profiles with redundancy (copies, parity) 4321 */ 4322 allowed = 0; 4323 for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) { 4324 if (btrfs_raid_array[i].ncopies >= 2 || 4325 btrfs_raid_array[i].tolerated_failures >= 1) 4326 allowed |= btrfs_raid_array[i].bg_flag; 4327 } 4328 do { 4329 seq = read_seqbegin(&fs_info->profiles_lock); 4330 4331 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 4332 (fs_info->avail_system_alloc_bits & allowed) && 4333 !(bctl->sys.target & allowed)) || 4334 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 4335 (fs_info->avail_metadata_alloc_bits & allowed) && 4336 !(bctl->meta.target & allowed))) 4337 reducing_redundancy = true; 4338 else 4339 reducing_redundancy = false; 4340 4341 /* if we're not converting, the target field is uninitialized */ 4342 meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 4343 bctl->meta.target : fs_info->avail_metadata_alloc_bits; 4344 data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 4345 bctl->data.target : fs_info->avail_data_alloc_bits; 4346 } while (read_seqretry(&fs_info->profiles_lock, seq)); 4347 4348 if (reducing_redundancy) { 4349 if (bctl->flags & BTRFS_BALANCE_FORCE) { 4350 btrfs_info(fs_info, 4351 "balance: force reducing metadata redundancy"); 4352 } else { 4353 btrfs_err(fs_info, 4354 "balance: reduces metadata redundancy, use --force if you want this"); 4355 ret = -EINVAL; 4356 goto out; 4357 } 4358 } 4359 4360 if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < 4361 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { 4362 btrfs_warn(fs_info, 4363 "balance: metadata profile %s has lower redundancy than data profile %s", 4364 btrfs_bg_type_to_raid_name(meta_target), 4365 btrfs_bg_type_to_raid_name(data_target)); 4366 } 4367 4368 ret = insert_balance_item(fs_info, bctl); 4369 if (ret && ret != -EEXIST) 4370 goto out; 4371 4372 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { 4373 BUG_ON(ret == -EEXIST); 4374 BUG_ON(fs_info->balance_ctl); 4375 spin_lock(&fs_info->balance_lock); 4376 fs_info->balance_ctl = bctl; 4377 spin_unlock(&fs_info->balance_lock); 4378 } else { 4379 BUG_ON(ret != -EEXIST); 4380 spin_lock(&fs_info->balance_lock); 4381 update_balance_args(bctl); 4382 spin_unlock(&fs_info->balance_lock); 4383 } 4384 4385 ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4386 set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); 4387 describe_balance_start_or_resume(fs_info); 4388 mutex_unlock(&fs_info->balance_mutex); 4389 4390 ret = __btrfs_balance(fs_info); 4391 4392 mutex_lock(&fs_info->balance_mutex); 4393 if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) { 4394 btrfs_info(fs_info, "balance: paused"); 4395 btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED); 4396 } 4397 /* 4398 * Balance can be canceled by: 4399 * 4400 * - Regular cancel request 4401 * Then ret == -ECANCELED and balance_cancel_req > 0 4402 * 4403 * - Fatal signal to "btrfs" process 4404 * 
Either the signal caught by wait_reserve_ticket() and callers 4405 * got -EINTR, or caught by btrfs_should_cancel_balance() and 4406 * got -ECANCELED. 4407 * Either way, in this case balance_cancel_req = 0, and 4408 * ret == -EINTR or ret == -ECANCELED. 4409 * 4410 * So here we only check the return value to catch canceled balance. 4411 */ 4412 else if (ret == -ECANCELED || ret == -EINTR) 4413 btrfs_info(fs_info, "balance: canceled"); 4414 else 4415 btrfs_info(fs_info, "balance: ended with status: %d", ret); 4416 4417 clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); 4418 4419 if (bargs) { 4420 memset(bargs, 0, sizeof(*bargs)); 4421 btrfs_update_ioctl_balance_args(fs_info, bargs); 4422 } 4423 4424 if ((ret && ret != -ECANCELED && ret != -ENOSPC) || 4425 balance_need_close(fs_info)) { 4426 reset_balance_state(fs_info); 4427 btrfs_exclop_finish(fs_info); 4428 } 4429 4430 wake_up(&fs_info->balance_wait_q); 4431 4432 return ret; 4433 out: 4434 if (bctl->flags & BTRFS_BALANCE_RESUME) 4435 reset_balance_state(fs_info); 4436 else 4437 kfree(bctl); 4438 btrfs_exclop_finish(fs_info); 4439 4440 return ret; 4441 } 4442 4443 static int balance_kthread(void *data) 4444 { 4445 struct btrfs_fs_info *fs_info = data; 4446 int ret = 0; 4447 4448 mutex_lock(&fs_info->balance_mutex); 4449 if (fs_info->balance_ctl) 4450 ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); 4451 mutex_unlock(&fs_info->balance_mutex); 4452 4453 return ret; 4454 } 4455 4456 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) 4457 { 4458 struct task_struct *tsk; 4459 4460 mutex_lock(&fs_info->balance_mutex); 4461 if (!fs_info->balance_ctl) { 4462 mutex_unlock(&fs_info->balance_mutex); 4463 return 0; 4464 } 4465 mutex_unlock(&fs_info->balance_mutex); 4466 4467 if (btrfs_test_opt(fs_info, SKIP_BALANCE)) { 4468 btrfs_info(fs_info, "balance: resume skipped"); 4469 return 0; 4470 } 4471 4472 spin_lock(&fs_info->super_lock); 4473 ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); 4474 fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE; 4475 spin_unlock(&fs_info->super_lock); 4476 /* 4477 * A ro->rw remount sequence should continue with the paused balance 4478 * regardless of who pauses it, system or the user as of now, so set 4479 * the resume flag. 
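 *
 * With BTRFS_BALANCE_RESUME set, btrfs_balance() expects the balance item
 * to already exist on disk (insert_balance_item() returning -EEXIST) and
 * describe_balance_start_or_resume() reports the operation as a resume
 * rather than a start.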
4480 */ 4481 spin_lock(&fs_info->balance_lock); 4482 fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME; 4483 spin_unlock(&fs_info->balance_lock); 4484 4485 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 4486 return PTR_ERR_OR_ZERO(tsk); 4487 } 4488 4489 int btrfs_recover_balance(struct btrfs_fs_info *fs_info) 4490 { 4491 struct btrfs_balance_control *bctl; 4492 struct btrfs_balance_item *item; 4493 struct btrfs_disk_balance_args disk_bargs; 4494 struct btrfs_path *path; 4495 struct extent_buffer *leaf; 4496 struct btrfs_key key; 4497 int ret; 4498 4499 path = btrfs_alloc_path(); 4500 if (!path) 4501 return -ENOMEM; 4502 4503 key.objectid = BTRFS_BALANCE_OBJECTID; 4504 key.type = BTRFS_TEMPORARY_ITEM_KEY; 4505 key.offset = 0; 4506 4507 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 4508 if (ret < 0) 4509 goto out; 4510 if (ret > 0) { /* ret = -ENOENT; */ 4511 ret = 0; 4512 goto out; 4513 } 4514 4515 bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 4516 if (!bctl) { 4517 ret = -ENOMEM; 4518 goto out; 4519 } 4520 4521 leaf = path->nodes[0]; 4522 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 4523 4524 bctl->flags = btrfs_balance_flags(leaf, item); 4525 bctl->flags |= BTRFS_BALANCE_RESUME; 4526 4527 btrfs_balance_data(leaf, item, &disk_bargs); 4528 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); 4529 btrfs_balance_meta(leaf, item, &disk_bargs); 4530 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); 4531 btrfs_balance_sys(leaf, item, &disk_bargs); 4532 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); 4533 4534 /* 4535 * This should never happen, as the paused balance state is recovered 4536 * during mount without any chance of other exclusive ops to collide. 4537 * 4538 * This gives the exclusive op status to balance and keeps in paused 4539 * state until user intervention (cancel or umount). If the ownership 4540 * cannot be assigned, show a message but do not fail. The balance 4541 * is in a paused state and must have fs_info::balance_ctl properly 4542 * set up. 
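 *
 * A read-write mount or remount normally hands this paused balance to
 * btrfs_resume_balance_async(), which switches the exclusive op back to
 * BTRFS_EXCLOP_BALANCE and restarts the operation in a kthread.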
4543 */ 4544 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED)) 4545 btrfs_warn(fs_info, 4546 "balance: cannot set exclusive op status, resume manually"); 4547 4548 btrfs_release_path(path); 4549 4550 mutex_lock(&fs_info->balance_mutex); 4551 BUG_ON(fs_info->balance_ctl); 4552 spin_lock(&fs_info->balance_lock); 4553 fs_info->balance_ctl = bctl; 4554 spin_unlock(&fs_info->balance_lock); 4555 mutex_unlock(&fs_info->balance_mutex); 4556 out: 4557 btrfs_free_path(path); 4558 return ret; 4559 } 4560 4561 int btrfs_pause_balance(struct btrfs_fs_info *fs_info) 4562 { 4563 int ret = 0; 4564 4565 mutex_lock(&fs_info->balance_mutex); 4566 if (!fs_info->balance_ctl) { 4567 mutex_unlock(&fs_info->balance_mutex); 4568 return -ENOTCONN; 4569 } 4570 4571 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 4572 atomic_inc(&fs_info->balance_pause_req); 4573 mutex_unlock(&fs_info->balance_mutex); 4574 4575 wait_event(fs_info->balance_wait_q, 4576 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4577 4578 mutex_lock(&fs_info->balance_mutex); 4579 /* we are good with balance_ctl ripped off from under us */ 4580 BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4581 atomic_dec(&fs_info->balance_pause_req); 4582 } else { 4583 ret = -ENOTCONN; 4584 } 4585 4586 mutex_unlock(&fs_info->balance_mutex); 4587 return ret; 4588 } 4589 4590 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) 4591 { 4592 mutex_lock(&fs_info->balance_mutex); 4593 if (!fs_info->balance_ctl) { 4594 mutex_unlock(&fs_info->balance_mutex); 4595 return -ENOTCONN; 4596 } 4597 4598 /* 4599 * A paused balance with the item stored on disk can be resumed at 4600 * mount time if the mount is read-write. Otherwise it's still paused 4601 * and we must not allow cancelling as it deletes the item. 4602 */ 4603 if (sb_rdonly(fs_info->sb)) { 4604 mutex_unlock(&fs_info->balance_mutex); 4605 return -EROFS; 4606 } 4607 4608 atomic_inc(&fs_info->balance_cancel_req); 4609 /* 4610 * if we are running just wait and return, balance item is 4611 * deleted in btrfs_balance in this case 4612 */ 4613 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 4614 mutex_unlock(&fs_info->balance_mutex); 4615 wait_event(fs_info->balance_wait_q, 4616 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4617 mutex_lock(&fs_info->balance_mutex); 4618 } else { 4619 mutex_unlock(&fs_info->balance_mutex); 4620 /* 4621 * Lock released to allow other waiters to continue, we'll 4622 * reexamine the status again. 
4623 */ 4624 mutex_lock(&fs_info->balance_mutex); 4625 4626 if (fs_info->balance_ctl) { 4627 reset_balance_state(fs_info); 4628 btrfs_exclop_finish(fs_info); 4629 btrfs_info(fs_info, "balance: canceled"); 4630 } 4631 } 4632 4633 BUG_ON(fs_info->balance_ctl || 4634 test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4635 atomic_dec(&fs_info->balance_cancel_req); 4636 mutex_unlock(&fs_info->balance_mutex); 4637 return 0; 4638 } 4639 4640 int btrfs_uuid_scan_kthread(void *data) 4641 { 4642 struct btrfs_fs_info *fs_info = data; 4643 struct btrfs_root *root = fs_info->tree_root; 4644 struct btrfs_key key; 4645 struct btrfs_path *path = NULL; 4646 int ret = 0; 4647 struct extent_buffer *eb; 4648 int slot; 4649 struct btrfs_root_item root_item; 4650 u32 item_size; 4651 struct btrfs_trans_handle *trans = NULL; 4652 bool closing = false; 4653 4654 path = btrfs_alloc_path(); 4655 if (!path) { 4656 ret = -ENOMEM; 4657 goto out; 4658 } 4659 4660 key.objectid = 0; 4661 key.type = BTRFS_ROOT_ITEM_KEY; 4662 key.offset = 0; 4663 4664 while (1) { 4665 if (btrfs_fs_closing(fs_info)) { 4666 closing = true; 4667 break; 4668 } 4669 ret = btrfs_search_forward(root, &key, path, 4670 BTRFS_OLDEST_GENERATION); 4671 if (ret) { 4672 if (ret > 0) 4673 ret = 0; 4674 break; 4675 } 4676 4677 if (key.type != BTRFS_ROOT_ITEM_KEY || 4678 (key.objectid < BTRFS_FIRST_FREE_OBJECTID && 4679 key.objectid != BTRFS_FS_TREE_OBJECTID) || 4680 key.objectid > BTRFS_LAST_FREE_OBJECTID) 4681 goto skip; 4682 4683 eb = path->nodes[0]; 4684 slot = path->slots[0]; 4685 item_size = btrfs_item_size(eb, slot); 4686 if (item_size < sizeof(root_item)) 4687 goto skip; 4688 4689 read_extent_buffer(eb, &root_item, 4690 btrfs_item_ptr_offset(eb, slot), 4691 (int)sizeof(root_item)); 4692 if (btrfs_root_refs(&root_item) == 0) 4693 goto skip; 4694 4695 if (!btrfs_is_empty_uuid(root_item.uuid) || 4696 !btrfs_is_empty_uuid(root_item.received_uuid)) { 4697 if (trans) 4698 goto update_tree; 4699 4700 btrfs_release_path(path); 4701 /* 4702 * 1 - subvol uuid item 4703 * 1 - received_subvol uuid item 4704 */ 4705 trans = btrfs_start_transaction(fs_info->uuid_root, 2); 4706 if (IS_ERR(trans)) { 4707 ret = PTR_ERR(trans); 4708 break; 4709 } 4710 continue; 4711 } else { 4712 goto skip; 4713 } 4714 update_tree: 4715 btrfs_release_path(path); 4716 if (!btrfs_is_empty_uuid(root_item.uuid)) { 4717 ret = btrfs_uuid_tree_add(trans, root_item.uuid, 4718 BTRFS_UUID_KEY_SUBVOL, 4719 key.objectid); 4720 if (ret < 0) { 4721 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4722 ret); 4723 break; 4724 } 4725 } 4726 4727 if (!btrfs_is_empty_uuid(root_item.received_uuid)) { 4728 ret = btrfs_uuid_tree_add(trans, 4729 root_item.received_uuid, 4730 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 4731 key.objectid); 4732 if (ret < 0) { 4733 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4734 ret); 4735 break; 4736 } 4737 } 4738 4739 skip: 4740 btrfs_release_path(path); 4741 if (trans) { 4742 ret = btrfs_end_transaction(trans); 4743 trans = NULL; 4744 if (ret) 4745 break; 4746 } 4747 4748 if (key.offset < (u64)-1) { 4749 key.offset++; 4750 } else if (key.type < BTRFS_ROOT_ITEM_KEY) { 4751 key.offset = 0; 4752 key.type = BTRFS_ROOT_ITEM_KEY; 4753 } else if (key.objectid < (u64)-1) { 4754 key.offset = 0; 4755 key.type = BTRFS_ROOT_ITEM_KEY; 4756 key.objectid++; 4757 } else { 4758 break; 4759 } 4760 cond_resched(); 4761 } 4762 4763 out: 4764 btrfs_free_path(path); 4765 if (trans && !IS_ERR(trans)) 4766 btrfs_end_transaction(trans); 4767 if (ret) 4768 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread 
failed %d", ret); 4769 else if (!closing) 4770 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); 4771 up(&fs_info->uuid_tree_rescan_sem); 4772 return 0; 4773 } 4774 4775 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) 4776 { 4777 struct btrfs_trans_handle *trans; 4778 struct btrfs_root *tree_root = fs_info->tree_root; 4779 struct btrfs_root *uuid_root; 4780 struct task_struct *task; 4781 int ret; 4782 4783 /* 4784 * 1 - root node 4785 * 1 - root item 4786 */ 4787 trans = btrfs_start_transaction(tree_root, 2); 4788 if (IS_ERR(trans)) 4789 return PTR_ERR(trans); 4790 4791 uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID); 4792 if (IS_ERR(uuid_root)) { 4793 ret = PTR_ERR(uuid_root); 4794 btrfs_abort_transaction(trans, ret); 4795 btrfs_end_transaction(trans); 4796 return ret; 4797 } 4798 4799 fs_info->uuid_root = uuid_root; 4800 4801 ret = btrfs_commit_transaction(trans); 4802 if (ret) 4803 return ret; 4804 4805 down(&fs_info->uuid_tree_rescan_sem); 4806 task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); 4807 if (IS_ERR(task)) { 4808 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 4809 btrfs_warn(fs_info, "failed to start uuid_scan task"); 4810 up(&fs_info->uuid_tree_rescan_sem); 4811 return PTR_ERR(task); 4812 } 4813 4814 return 0; 4815 } 4816 4817 /* 4818 * shrinking a device means finding all of the device extents past 4819 * the new size, and then following the back refs to the chunks. 4820 * The chunk relocation code actually frees the device extent 4821 */ 4822 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 4823 { 4824 struct btrfs_fs_info *fs_info = device->fs_info; 4825 struct btrfs_root *root = fs_info->dev_root; 4826 struct btrfs_trans_handle *trans; 4827 struct btrfs_dev_extent *dev_extent = NULL; 4828 struct btrfs_path *path; 4829 u64 length; 4830 u64 chunk_offset; 4831 int ret; 4832 int slot; 4833 int failed = 0; 4834 bool retried = false; 4835 struct extent_buffer *l; 4836 struct btrfs_key key; 4837 struct btrfs_super_block *super_copy = fs_info->super_copy; 4838 u64 old_total = btrfs_super_total_bytes(super_copy); 4839 u64 old_size = btrfs_device_get_total_bytes(device); 4840 u64 diff; 4841 u64 start; 4842 4843 new_size = round_down(new_size, fs_info->sectorsize); 4844 start = new_size; 4845 diff = round_down(old_size - new_size, fs_info->sectorsize); 4846 4847 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 4848 return -EINVAL; 4849 4850 path = btrfs_alloc_path(); 4851 if (!path) 4852 return -ENOMEM; 4853 4854 path->reada = READA_BACK; 4855 4856 trans = btrfs_start_transaction(root, 0); 4857 if (IS_ERR(trans)) { 4858 btrfs_free_path(path); 4859 return PTR_ERR(trans); 4860 } 4861 4862 mutex_lock(&fs_info->chunk_mutex); 4863 4864 btrfs_device_set_total_bytes(device, new_size); 4865 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 4866 device->fs_devices->total_rw_bytes -= diff; 4867 atomic64_sub(diff, &fs_info->free_chunk_space); 4868 } 4869 4870 /* 4871 * Once the device's size has been set to the new size, ensure all 4872 * in-memory chunks are synced to disk so that the loop below sees them 4873 * and relocates them accordingly. 
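 *
 * Note that contains_pending_extent() below is what decides whether the
 * commit is actually needed.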
4874 */ 4875 if (contains_pending_extent(device, &start, diff)) { 4876 mutex_unlock(&fs_info->chunk_mutex); 4877 ret = btrfs_commit_transaction(trans); 4878 if (ret) 4879 goto done; 4880 } else { 4881 mutex_unlock(&fs_info->chunk_mutex); 4882 btrfs_end_transaction(trans); 4883 } 4884 4885 again: 4886 key.objectid = device->devid; 4887 key.offset = (u64)-1; 4888 key.type = BTRFS_DEV_EXTENT_KEY; 4889 4890 do { 4891 mutex_lock(&fs_info->reclaim_bgs_lock); 4892 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4893 if (ret < 0) { 4894 mutex_unlock(&fs_info->reclaim_bgs_lock); 4895 goto done; 4896 } 4897 4898 ret = btrfs_previous_item(root, path, 0, key.type); 4899 if (ret) { 4900 mutex_unlock(&fs_info->reclaim_bgs_lock); 4901 if (ret < 0) 4902 goto done; 4903 ret = 0; 4904 btrfs_release_path(path); 4905 break; 4906 } 4907 4908 l = path->nodes[0]; 4909 slot = path->slots[0]; 4910 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 4911 4912 if (key.objectid != device->devid) { 4913 mutex_unlock(&fs_info->reclaim_bgs_lock); 4914 btrfs_release_path(path); 4915 break; 4916 } 4917 4918 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 4919 length = btrfs_dev_extent_length(l, dev_extent); 4920 4921 if (key.offset + length <= new_size) { 4922 mutex_unlock(&fs_info->reclaim_bgs_lock); 4923 btrfs_release_path(path); 4924 break; 4925 } 4926 4927 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 4928 btrfs_release_path(path); 4929 4930 /* 4931 * We may be relocating the only data chunk we have, 4932 * which could potentially end up with losing data's 4933 * raid profile, so lets allocate an empty one in 4934 * advance. 4935 */ 4936 ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset); 4937 if (ret < 0) { 4938 mutex_unlock(&fs_info->reclaim_bgs_lock); 4939 goto done; 4940 } 4941 4942 ret = btrfs_relocate_chunk(fs_info, chunk_offset); 4943 mutex_unlock(&fs_info->reclaim_bgs_lock); 4944 if (ret == -ENOSPC) { 4945 failed++; 4946 } else if (ret) { 4947 if (ret == -ETXTBSY) { 4948 btrfs_warn(fs_info, 4949 "could not shrink block group %llu due to active swapfile", 4950 chunk_offset); 4951 } 4952 goto done; 4953 } 4954 } while (key.offset-- > 0); 4955 4956 if (failed && !retried) { 4957 failed = 0; 4958 retried = true; 4959 goto again; 4960 } else if (failed && retried) { 4961 ret = -ENOSPC; 4962 goto done; 4963 } 4964 4965 /* Shrinking succeeded, else we would be at "done". */ 4966 trans = btrfs_start_transaction(root, 0); 4967 if (IS_ERR(trans)) { 4968 ret = PTR_ERR(trans); 4969 goto done; 4970 } 4971 4972 mutex_lock(&fs_info->chunk_mutex); 4973 /* Clear all state bits beyond the shrunk device size */ 4974 clear_extent_bits(&device->alloc_state, new_size, (u64)-1, 4975 CHUNK_STATE_MASK); 4976 4977 btrfs_device_set_disk_total_bytes(device, new_size); 4978 if (list_empty(&device->post_commit_list)) 4979 list_add_tail(&device->post_commit_list, 4980 &trans->transaction->dev_update_list); 4981 4982 WARN_ON(diff > old_total); 4983 btrfs_set_super_total_bytes(super_copy, 4984 round_down(old_total - diff, fs_info->sectorsize)); 4985 mutex_unlock(&fs_info->chunk_mutex); 4986 4987 btrfs_reserve_chunk_metadata(trans, false); 4988 /* Now btrfs_update_device() will change the on-disk size. 
*/ 4989 ret = btrfs_update_device(trans, device); 4990 btrfs_trans_release_chunk_metadata(trans); 4991 if (ret < 0) { 4992 btrfs_abort_transaction(trans, ret); 4993 btrfs_end_transaction(trans); 4994 } else { 4995 ret = btrfs_commit_transaction(trans); 4996 } 4997 done: 4998 btrfs_free_path(path); 4999 if (ret) { 5000 mutex_lock(&fs_info->chunk_mutex); 5001 btrfs_device_set_total_bytes(device, old_size); 5002 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 5003 device->fs_devices->total_rw_bytes += diff; 5004 atomic64_add(diff, &fs_info->free_chunk_space); 5005 mutex_unlock(&fs_info->chunk_mutex); 5006 } 5007 return ret; 5008 } 5009 5010 static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info, 5011 struct btrfs_key *key, 5012 struct btrfs_chunk *chunk, int item_size) 5013 { 5014 struct btrfs_super_block *super_copy = fs_info->super_copy; 5015 struct btrfs_disk_key disk_key; 5016 u32 array_size; 5017 u8 *ptr; 5018 5019 lockdep_assert_held(&fs_info->chunk_mutex); 5020 5021 array_size = btrfs_super_sys_array_size(super_copy); 5022 if (array_size + item_size + sizeof(disk_key) 5023 > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) 5024 return -EFBIG; 5025 5026 ptr = super_copy->sys_chunk_array + array_size; 5027 btrfs_cpu_key_to_disk(&disk_key, key); 5028 memcpy(ptr, &disk_key, sizeof(disk_key)); 5029 ptr += sizeof(disk_key); 5030 memcpy(ptr, chunk, item_size); 5031 item_size += sizeof(disk_key); 5032 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 5033 5034 return 0; 5035 } 5036 5037 /* 5038 * sort the devices in descending order by max_avail, total_avail 5039 */ 5040 static int btrfs_cmp_device_info(const void *a, const void *b) 5041 { 5042 const struct btrfs_device_info *di_a = a; 5043 const struct btrfs_device_info *di_b = b; 5044 5045 if (di_a->max_avail > di_b->max_avail) 5046 return -1; 5047 if (di_a->max_avail < di_b->max_avail) 5048 return 1; 5049 if (di_a->total_avail > di_b->total_avail) 5050 return -1; 5051 if (di_a->total_avail < di_b->total_avail) 5052 return 1; 5053 return 0; 5054 } 5055 5056 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) 5057 { 5058 if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK)) 5059 return; 5060 5061 btrfs_set_fs_incompat(info, RAID56); 5062 } 5063 5064 static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type) 5065 { 5066 if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4))) 5067 return; 5068 5069 btrfs_set_fs_incompat(info, RAID1C34); 5070 } 5071 5072 /* 5073 * Structure used internally for btrfs_create_chunk() function. 5074 * Wraps needed parameters. 
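 *
 * Rough lifecycle (see btrfs_create_chunk() below): init_alloc_chunk_ctl()
 * seeds the limits from btrfs_raid_array and the chunk allocation policy,
 * gather_device_info() and decide_stripe_size() fill in ndevs, stripe_size
 * and chunk_size, and create_chunk() consumes the result.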
5075 */ 5076 struct alloc_chunk_ctl { 5077 u64 start; 5078 u64 type; 5079 /* Total number of stripes to allocate */ 5080 int num_stripes; 5081 /* sub_stripes info for map */ 5082 int sub_stripes; 5083 /* Stripes per device */ 5084 int dev_stripes; 5085 /* Maximum number of devices to use */ 5086 int devs_max; 5087 /* Minimum number of devices to use */ 5088 int devs_min; 5089 /* ndevs has to be a multiple of this */ 5090 int devs_increment; 5091 /* Number of copies */ 5092 int ncopies; 5093 /* Number of stripes worth of bytes to store parity information */ 5094 int nparity; 5095 u64 max_stripe_size; 5096 u64 max_chunk_size; 5097 u64 dev_extent_min; 5098 u64 stripe_size; 5099 u64 chunk_size; 5100 int ndevs; 5101 }; 5102 5103 static void init_alloc_chunk_ctl_policy_regular( 5104 struct btrfs_fs_devices *fs_devices, 5105 struct alloc_chunk_ctl *ctl) 5106 { 5107 u64 type = ctl->type; 5108 5109 if (type & BTRFS_BLOCK_GROUP_DATA) { 5110 ctl->max_stripe_size = SZ_1G; 5111 ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; 5112 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 5113 /* For larger filesystems, use larger metadata chunks */ 5114 if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) 5115 ctl->max_stripe_size = SZ_1G; 5116 else 5117 ctl->max_stripe_size = SZ_256M; 5118 ctl->max_chunk_size = ctl->max_stripe_size; 5119 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 5120 ctl->max_stripe_size = SZ_32M; 5121 ctl->max_chunk_size = 2 * ctl->max_stripe_size; 5122 ctl->devs_max = min_t(int, ctl->devs_max, 5123 BTRFS_MAX_DEVS_SYS_CHUNK); 5124 } else { 5125 BUG(); 5126 } 5127 5128 /* We don't want a chunk larger than 10% of writable space */ 5129 ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 5130 ctl->max_chunk_size); 5131 ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes; 5132 } 5133 5134 static void init_alloc_chunk_ctl_policy_zoned( 5135 struct btrfs_fs_devices *fs_devices, 5136 struct alloc_chunk_ctl *ctl) 5137 { 5138 u64 zone_size = fs_devices->fs_info->zone_size; 5139 u64 limit; 5140 int min_num_stripes = ctl->devs_min * ctl->dev_stripes; 5141 int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies; 5142 u64 min_chunk_size = min_data_stripes * zone_size; 5143 u64 type = ctl->type; 5144 5145 ctl->max_stripe_size = zone_size; 5146 if (type & BTRFS_BLOCK_GROUP_DATA) { 5147 ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE, 5148 zone_size); 5149 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 5150 ctl->max_chunk_size = ctl->max_stripe_size; 5151 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 5152 ctl->max_chunk_size = 2 * ctl->max_stripe_size; 5153 ctl->devs_max = min_t(int, ctl->devs_max, 5154 BTRFS_MAX_DEVS_SYS_CHUNK); 5155 } else { 5156 BUG(); 5157 } 5158 5159 /* We don't want a chunk larger than 10% of writable space */ 5160 limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1), 5161 zone_size), 5162 min_chunk_size); 5163 ctl->max_chunk_size = min(limit, ctl->max_chunk_size); 5164 ctl->dev_extent_min = zone_size * ctl->dev_stripes; 5165 } 5166 5167 static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, 5168 struct alloc_chunk_ctl *ctl) 5169 { 5170 int index = btrfs_bg_flags_to_raid_index(ctl->type); 5171 5172 ctl->sub_stripes = btrfs_raid_array[index].sub_stripes; 5173 ctl->dev_stripes = btrfs_raid_array[index].dev_stripes; 5174 ctl->devs_max = btrfs_raid_array[index].devs_max; 5175 if (!ctl->devs_max) 5176 ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info); 5177 ctl->devs_min = btrfs_raid_array[index].devs_min; 5178 
ctl->devs_increment = btrfs_raid_array[index].devs_increment; 5179 ctl->ncopies = btrfs_raid_array[index].ncopies; 5180 ctl->nparity = btrfs_raid_array[index].nparity; 5181 ctl->ndevs = 0; 5182 5183 switch (fs_devices->chunk_alloc_policy) { 5184 case BTRFS_CHUNK_ALLOC_REGULAR: 5185 init_alloc_chunk_ctl_policy_regular(fs_devices, ctl); 5186 break; 5187 case BTRFS_CHUNK_ALLOC_ZONED: 5188 init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl); 5189 break; 5190 default: 5191 BUG(); 5192 } 5193 } 5194 5195 static int gather_device_info(struct btrfs_fs_devices *fs_devices, 5196 struct alloc_chunk_ctl *ctl, 5197 struct btrfs_device_info *devices_info) 5198 { 5199 struct btrfs_fs_info *info = fs_devices->fs_info; 5200 struct btrfs_device *device; 5201 u64 total_avail; 5202 u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes; 5203 int ret; 5204 int ndevs = 0; 5205 u64 max_avail; 5206 u64 dev_offset; 5207 5208 /* 5209 * in the first pass through the devices list, we gather information 5210 * about the available holes on each device. 5211 */ 5212 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 5213 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 5214 WARN(1, KERN_ERR 5215 "BTRFS: read-only device in alloc_list\n"); 5216 continue; 5217 } 5218 5219 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 5220 &device->dev_state) || 5221 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 5222 continue; 5223 5224 if (device->total_bytes > device->bytes_used) 5225 total_avail = device->total_bytes - device->bytes_used; 5226 else 5227 total_avail = 0; 5228 5229 /* If there is no space on this device, skip it. */ 5230 if (total_avail < ctl->dev_extent_min) 5231 continue; 5232 5233 ret = find_free_dev_extent(device, dev_extent_want, &dev_offset, 5234 &max_avail); 5235 if (ret && ret != -ENOSPC) 5236 return ret; 5237 5238 if (ret == 0) 5239 max_avail = dev_extent_want; 5240 5241 if (max_avail < ctl->dev_extent_min) { 5242 if (btrfs_test_opt(info, ENOSPC_DEBUG)) 5243 btrfs_debug(info, 5244 "%s: devid %llu has no free space, have=%llu want=%llu", 5245 __func__, device->devid, max_avail, 5246 ctl->dev_extent_min); 5247 continue; 5248 } 5249 5250 if (ndevs == fs_devices->rw_devices) { 5251 WARN(1, "%s: found more than %llu devices\n", 5252 __func__, fs_devices->rw_devices); 5253 break; 5254 } 5255 devices_info[ndevs].dev_offset = dev_offset; 5256 devices_info[ndevs].max_avail = max_avail; 5257 devices_info[ndevs].total_avail = total_avail; 5258 devices_info[ndevs].dev = device; 5259 ++ndevs; 5260 } 5261 ctl->ndevs = ndevs; 5262 5263 /* 5264 * now sort the devices by hole size / available space 5265 */ 5266 sort(devices_info, ndevs, sizeof(struct btrfs_device_info), 5267 btrfs_cmp_device_info, NULL); 5268 5269 return 0; 5270 } 5271 5272 static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl, 5273 struct btrfs_device_info *devices_info) 5274 { 5275 /* Number of stripes that count for block group size */ 5276 int data_stripes; 5277 5278 /* 5279 * The primary goal is to maximize the number of stripes, so use as 5280 * many devices as possible, even if the stripes are not maximum sized. 5281 * 5282 * The DUP profile stores more than one stripe per device, the 5283 * max_avail is the total size so we have to adjust. 
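 *
 * A small worked example with made-up numbers: for DUP, dev_stripes is 2,
 * so a device whose largest hole (max_avail) is 2GiB yields an initial
 * stripe_size of 1GiB below, before the max_chunk_size check and the
 * BTRFS_STRIPE_LEN alignment are applied.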
 */
	ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
				   ctl->dev_stripes);
	ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;

	/* This will have to be fixed for RAID1 and RAID10 over more drives */
	data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;

	/*
	 * Use the number of data stripes to figure out how big this chunk is
	 * really going to be in terms of logical address space, and compare
	 * that answer with the max chunk size. If it's higher, we try to
	 * reduce stripe_size.
	 */
	if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
		/*
		 * Reduce stripe_size, round it up to a 16MB boundary again and
		 * then use it, unless it ends up being even bigger than the
		 * previous value we had already.
		 */
		ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
							data_stripes), SZ_16M),
				       ctl->stripe_size);
	}

	/* Align to BTRFS_STRIPE_LEN */
	ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
	ctl->chunk_size = ctl->stripe_size * data_stripes;

	return 0;
}

static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
				    struct btrfs_device_info *devices_info)
{
	u64 zone_size = devices_info[0].dev->zone_info->zone_size;
	/* Number of stripes that count for block group size */
	int data_stripes;

	/*
	 * It should hold because:
	 * dev_extent_min == dev_extent_want == zone_size * dev_stripes
	 */
	ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);

	ctl->stripe_size = zone_size;
	ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
	data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;

	/* stripe_size is fixed in a zoned filesystem. Reduce ndevs instead. */
	if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
		ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
					     ctl->stripe_size) + ctl->nparity,
				     ctl->dev_stripes);
		ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
		data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
		ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
	}

	ctl->chunk_size = ctl->stripe_size * data_stripes;

	return 0;
}

static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
			      struct alloc_chunk_ctl *ctl,
			      struct btrfs_device_info *devices_info)
{
	struct btrfs_fs_info *info = fs_devices->fs_info;

	/*
	 * Round down to number of usable stripes, devs_increment can be any
	 * number so we can't use round_down() that requires power of 2, while
	 * rounddown is safe.
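	 *
	 * For instance, RAID10 has devs_increment == 2, so 5 usable devices
	 * are rounded down to 4 here; if the result drops below devs_min the
	 * allocation fails with -ENOSPC right after.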
5358 */ 5359 ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment); 5360 5361 if (ctl->ndevs < ctl->devs_min) { 5362 if (btrfs_test_opt(info, ENOSPC_DEBUG)) { 5363 btrfs_debug(info, 5364 "%s: not enough devices with free space: have=%d minimum required=%d", 5365 __func__, ctl->ndevs, ctl->devs_min); 5366 } 5367 return -ENOSPC; 5368 } 5369 5370 ctl->ndevs = min(ctl->ndevs, ctl->devs_max); 5371 5372 switch (fs_devices->chunk_alloc_policy) { 5373 case BTRFS_CHUNK_ALLOC_REGULAR: 5374 return decide_stripe_size_regular(ctl, devices_info); 5375 case BTRFS_CHUNK_ALLOC_ZONED: 5376 return decide_stripe_size_zoned(ctl, devices_info); 5377 default: 5378 BUG(); 5379 } 5380 } 5381 5382 static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, 5383 struct alloc_chunk_ctl *ctl, 5384 struct btrfs_device_info *devices_info) 5385 { 5386 struct btrfs_fs_info *info = trans->fs_info; 5387 struct map_lookup *map = NULL; 5388 struct extent_map_tree *em_tree; 5389 struct btrfs_block_group *block_group; 5390 struct extent_map *em; 5391 u64 start = ctl->start; 5392 u64 type = ctl->type; 5393 int ret; 5394 int i; 5395 int j; 5396 5397 map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS); 5398 if (!map) 5399 return ERR_PTR(-ENOMEM); 5400 map->num_stripes = ctl->num_stripes; 5401 5402 for (i = 0; i < ctl->ndevs; ++i) { 5403 for (j = 0; j < ctl->dev_stripes; ++j) { 5404 int s = i * ctl->dev_stripes + j; 5405 map->stripes[s].dev = devices_info[i].dev; 5406 map->stripes[s].physical = devices_info[i].dev_offset + 5407 j * ctl->stripe_size; 5408 } 5409 } 5410 map->stripe_len = BTRFS_STRIPE_LEN; 5411 map->io_align = BTRFS_STRIPE_LEN; 5412 map->io_width = BTRFS_STRIPE_LEN; 5413 map->type = type; 5414 map->sub_stripes = ctl->sub_stripes; 5415 5416 trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size); 5417 5418 em = alloc_extent_map(); 5419 if (!em) { 5420 kfree(map); 5421 return ERR_PTR(-ENOMEM); 5422 } 5423 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 5424 em->map_lookup = map; 5425 em->start = start; 5426 em->len = ctl->chunk_size; 5427 em->block_start = 0; 5428 em->block_len = em->len; 5429 em->orig_block_len = ctl->stripe_size; 5430 5431 em_tree = &info->mapping_tree; 5432 write_lock(&em_tree->lock); 5433 ret = add_extent_mapping(em_tree, em, 0); 5434 if (ret) { 5435 write_unlock(&em_tree->lock); 5436 free_extent_map(em); 5437 return ERR_PTR(ret); 5438 } 5439 write_unlock(&em_tree->lock); 5440 5441 block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size); 5442 if (IS_ERR(block_group)) 5443 goto error_del_extent; 5444 5445 for (i = 0; i < map->num_stripes; i++) { 5446 struct btrfs_device *dev = map->stripes[i].dev; 5447 5448 btrfs_device_set_bytes_used(dev, 5449 dev->bytes_used + ctl->stripe_size); 5450 if (list_empty(&dev->post_commit_list)) 5451 list_add_tail(&dev->post_commit_list, 5452 &trans->transaction->dev_update_list); 5453 } 5454 5455 atomic64_sub(ctl->stripe_size * map->num_stripes, 5456 &info->free_chunk_space); 5457 5458 free_extent_map(em); 5459 check_raid56_incompat_flag(info, type); 5460 check_raid1c34_incompat_flag(info, type); 5461 5462 return block_group; 5463 5464 error_del_extent: 5465 write_lock(&em_tree->lock); 5466 remove_extent_mapping(em_tree, em); 5467 write_unlock(&em_tree->lock); 5468 5469 /* One for our allocation */ 5470 free_extent_map(em); 5471 /* One for the tree reference */ 5472 free_extent_map(em); 5473 5474 return block_group; 5475 } 5476 5477 struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, 5478 u64 
type) 5479 { 5480 struct btrfs_fs_info *info = trans->fs_info; 5481 struct btrfs_fs_devices *fs_devices = info->fs_devices; 5482 struct btrfs_device_info *devices_info = NULL; 5483 struct alloc_chunk_ctl ctl; 5484 struct btrfs_block_group *block_group; 5485 int ret; 5486 5487 lockdep_assert_held(&info->chunk_mutex); 5488 5489 if (!alloc_profile_is_valid(type, 0)) { 5490 ASSERT(0); 5491 return ERR_PTR(-EINVAL); 5492 } 5493 5494 if (list_empty(&fs_devices->alloc_list)) { 5495 if (btrfs_test_opt(info, ENOSPC_DEBUG)) 5496 btrfs_debug(info, "%s: no writable device", __func__); 5497 return ERR_PTR(-ENOSPC); 5498 } 5499 5500 if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { 5501 btrfs_err(info, "invalid chunk type 0x%llx requested", type); 5502 ASSERT(0); 5503 return ERR_PTR(-EINVAL); 5504 } 5505 5506 ctl.start = find_next_chunk(info); 5507 ctl.type = type; 5508 init_alloc_chunk_ctl(fs_devices, &ctl); 5509 5510 devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), 5511 GFP_NOFS); 5512 if (!devices_info) 5513 return ERR_PTR(-ENOMEM); 5514 5515 ret = gather_device_info(fs_devices, &ctl, devices_info); 5516 if (ret < 0) { 5517 block_group = ERR_PTR(ret); 5518 goto out; 5519 } 5520 5521 ret = decide_stripe_size(fs_devices, &ctl, devices_info); 5522 if (ret < 0) { 5523 block_group = ERR_PTR(ret); 5524 goto out; 5525 } 5526 5527 block_group = create_chunk(trans, &ctl, devices_info); 5528 5529 out: 5530 kfree(devices_info); 5531 return block_group; 5532 } 5533 5534 /* 5535 * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the 5536 * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system 5537 * chunks. 5538 * 5539 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation 5540 * phases. 5541 */ 5542 int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, 5543 struct btrfs_block_group *bg) 5544 { 5545 struct btrfs_fs_info *fs_info = trans->fs_info; 5546 struct btrfs_root *chunk_root = fs_info->chunk_root; 5547 struct btrfs_key key; 5548 struct btrfs_chunk *chunk; 5549 struct btrfs_stripe *stripe; 5550 struct extent_map *em; 5551 struct map_lookup *map; 5552 size_t item_size; 5553 int i; 5554 int ret; 5555 5556 /* 5557 * We take the chunk_mutex for 2 reasons: 5558 * 5559 * 1) Updates and insertions in the chunk btree must be done while holding 5560 * the chunk_mutex, as well as updating the system chunk array in the 5561 * superblock. See the comment on top of btrfs_chunk_alloc() for the 5562 * details; 5563 * 5564 * 2) To prevent races with the final phase of a device replace operation 5565 * that replaces the device object associated with the map's stripes, 5566 * because the device object's id can change at any time during that 5567 * final phase of the device replace operation 5568 * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the 5569 * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID, 5570 * which would cause a failure when updating the device item, which does 5571 * not exists, or persisting a stripe of the chunk item with such ID. 5572 * Here we can't use the device_list_mutex because our caller already 5573 * has locked the chunk_mutex, and the final phase of device replace 5574 * acquires both mutexes - first the device_list_mutex and then the 5575 * chunk_mutex. Using any of those two mutexes protects us from a 5576 * concurrent device replace. 
5577 */ 5578 lockdep_assert_held(&fs_info->chunk_mutex); 5579 5580 em = btrfs_get_chunk_map(fs_info, bg->start, bg->length); 5581 if (IS_ERR(em)) { 5582 ret = PTR_ERR(em); 5583 btrfs_abort_transaction(trans, ret); 5584 return ret; 5585 } 5586 5587 map = em->map_lookup; 5588 item_size = btrfs_chunk_item_size(map->num_stripes); 5589 5590 chunk = kzalloc(item_size, GFP_NOFS); 5591 if (!chunk) { 5592 ret = -ENOMEM; 5593 btrfs_abort_transaction(trans, ret); 5594 goto out; 5595 } 5596 5597 for (i = 0; i < map->num_stripes; i++) { 5598 struct btrfs_device *device = map->stripes[i].dev; 5599 5600 ret = btrfs_update_device(trans, device); 5601 if (ret) 5602 goto out; 5603 } 5604 5605 stripe = &chunk->stripe; 5606 for (i = 0; i < map->num_stripes; i++) { 5607 struct btrfs_device *device = map->stripes[i].dev; 5608 const u64 dev_offset = map->stripes[i].physical; 5609 5610 btrfs_set_stack_stripe_devid(stripe, device->devid); 5611 btrfs_set_stack_stripe_offset(stripe, dev_offset); 5612 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 5613 stripe++; 5614 } 5615 5616 btrfs_set_stack_chunk_length(chunk, bg->length); 5617 btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID); 5618 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); 5619 btrfs_set_stack_chunk_type(chunk, map->type); 5620 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 5621 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); 5622 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); 5623 btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize); 5624 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 5625 5626 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 5627 key.type = BTRFS_CHUNK_ITEM_KEY; 5628 key.offset = bg->start; 5629 5630 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 5631 if (ret) 5632 goto out; 5633 5634 bg->chunk_item_inserted = 1; 5635 5636 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 5637 ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size); 5638 if (ret) 5639 goto out; 5640 } 5641 5642 out: 5643 kfree(chunk); 5644 free_extent_map(em); 5645 return ret; 5646 } 5647 5648 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) 5649 { 5650 struct btrfs_fs_info *fs_info = trans->fs_info; 5651 u64 alloc_profile; 5652 struct btrfs_block_group *meta_bg; 5653 struct btrfs_block_group *sys_bg; 5654 5655 /* 5656 * When adding a new device for sprouting, the seed device is read-only 5657 * so we must first allocate a metadata and a system chunk. But before 5658 * adding the block group items to the extent, device and chunk btrees, 5659 * we must first: 5660 * 5661 * 1) Create both chunks without doing any changes to the btrees, as 5662 * otherwise we would get -ENOSPC since the block groups from the 5663 * seed device are read-only; 5664 * 5665 * 2) Add the device item for the new sprout device - finishing the setup 5666 * of a new block group requires updating the device item in the chunk 5667 * btree, so it must exist when we attempt to do it. The previous step 5668 * ensures this does not fail with -ENOSPC. 5669 * 5670 * After that we can add the block group items to their btrees: 5671 * update existing device item in the chunk btree, add a new block group 5672 * item to the extent btree, add a new chunk item to the chunk btree and 5673 * finally add the new device extent items to the devices btree. 
5674 */ 5675 5676 alloc_profile = btrfs_metadata_alloc_profile(fs_info); 5677 meta_bg = btrfs_create_chunk(trans, alloc_profile); 5678 if (IS_ERR(meta_bg)) 5679 return PTR_ERR(meta_bg); 5680 5681 alloc_profile = btrfs_system_alloc_profile(fs_info); 5682 sys_bg = btrfs_create_chunk(trans, alloc_profile); 5683 if (IS_ERR(sys_bg)) 5684 return PTR_ERR(sys_bg); 5685 5686 return 0; 5687 } 5688 5689 static inline int btrfs_chunk_max_errors(struct map_lookup *map) 5690 { 5691 const int index = btrfs_bg_flags_to_raid_index(map->type); 5692 5693 return btrfs_raid_array[index].tolerated_failures; 5694 } 5695 5696 bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset) 5697 { 5698 struct extent_map *em; 5699 struct map_lookup *map; 5700 int miss_ndevs = 0; 5701 int i; 5702 bool ret = true; 5703 5704 em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); 5705 if (IS_ERR(em)) 5706 return false; 5707 5708 map = em->map_lookup; 5709 for (i = 0; i < map->num_stripes; i++) { 5710 if (test_bit(BTRFS_DEV_STATE_MISSING, 5711 &map->stripes[i].dev->dev_state)) { 5712 miss_ndevs++; 5713 continue; 5714 } 5715 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, 5716 &map->stripes[i].dev->dev_state)) { 5717 ret = false; 5718 goto end; 5719 } 5720 } 5721 5722 /* 5723 * If the number of missing devices is larger than max errors, we can 5724 * not write the data into that chunk successfully. 5725 */ 5726 if (miss_ndevs > btrfs_chunk_max_errors(map)) 5727 ret = false; 5728 end: 5729 free_extent_map(em); 5730 return ret; 5731 } 5732 5733 void btrfs_mapping_tree_free(struct extent_map_tree *tree) 5734 { 5735 struct extent_map *em; 5736 5737 while (1) { 5738 write_lock(&tree->lock); 5739 em = lookup_extent_mapping(tree, 0, (u64)-1); 5740 if (em) 5741 remove_extent_mapping(tree, em); 5742 write_unlock(&tree->lock); 5743 if (!em) 5744 break; 5745 /* once for us */ 5746 free_extent_map(em); 5747 /* once for the tree */ 5748 free_extent_map(em); 5749 } 5750 } 5751 5752 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5753 { 5754 struct extent_map *em; 5755 struct map_lookup *map; 5756 int ret; 5757 5758 em = btrfs_get_chunk_map(fs_info, logical, len); 5759 if (IS_ERR(em)) 5760 /* 5761 * We could return errors for these cases, but that could get 5762 * ugly and we'd probably do the same thing which is just not do 5763 * anything else and exit, so return 1 so the callers don't try 5764 * to use other copies. 5765 */ 5766 return 1; 5767 5768 map = em->map_lookup; 5769 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK)) 5770 ret = map->num_stripes; 5771 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5772 ret = map->sub_stripes; 5773 else if (map->type & BTRFS_BLOCK_GROUP_RAID5) 5774 ret = 2; 5775 else if (map->type & BTRFS_BLOCK_GROUP_RAID6) 5776 /* 5777 * There could be two corrupted data stripes, we need 5778 * to loop retry in order to rebuild the correct data. 5779 * 5780 * Fail a stripe at a time on every retry except the 5781 * stripe under reconstruction. 
5782 */ 5783 ret = map->num_stripes; 5784 else 5785 ret = 1; 5786 free_extent_map(em); 5787 5788 down_read(&fs_info->dev_replace.rwsem); 5789 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && 5790 fs_info->dev_replace.tgtdev) 5791 ret++; 5792 up_read(&fs_info->dev_replace.rwsem); 5793 5794 return ret; 5795 } 5796 5797 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, 5798 u64 logical) 5799 { 5800 struct extent_map *em; 5801 struct map_lookup *map; 5802 unsigned long len = fs_info->sectorsize; 5803 5804 em = btrfs_get_chunk_map(fs_info, logical, len); 5805 5806 if (!WARN_ON(IS_ERR(em))) { 5807 map = em->map_lookup; 5808 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 5809 len = map->stripe_len * nr_data_stripes(map); 5810 free_extent_map(em); 5811 } 5812 return len; 5813 } 5814 5815 int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5816 { 5817 struct extent_map *em; 5818 struct map_lookup *map; 5819 int ret = 0; 5820 5821 em = btrfs_get_chunk_map(fs_info, logical, len); 5822 5823 if(!WARN_ON(IS_ERR(em))) { 5824 map = em->map_lookup; 5825 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 5826 ret = 1; 5827 free_extent_map(em); 5828 } 5829 return ret; 5830 } 5831 5832 static int find_live_mirror(struct btrfs_fs_info *fs_info, 5833 struct map_lookup *map, int first, 5834 int dev_replace_is_ongoing) 5835 { 5836 int i; 5837 int num_stripes; 5838 int preferred_mirror; 5839 int tolerance; 5840 struct btrfs_device *srcdev; 5841 5842 ASSERT((map->type & 5843 (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10))); 5844 5845 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5846 num_stripes = map->sub_stripes; 5847 else 5848 num_stripes = map->num_stripes; 5849 5850 switch (fs_info->fs_devices->read_policy) { 5851 default: 5852 /* Shouldn't happen, just warn and use pid instead of failing */ 5853 btrfs_warn_rl(fs_info, 5854 "unknown read_policy type %u, reset to pid", 5855 fs_info->fs_devices->read_policy); 5856 fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID; 5857 fallthrough; 5858 case BTRFS_READ_POLICY_PID: 5859 preferred_mirror = first + (current->pid % num_stripes); 5860 break; 5861 } 5862 5863 if (dev_replace_is_ongoing && 5864 fs_info->dev_replace.cont_reading_from_srcdev_mode == 5865 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) 5866 srcdev = fs_info->dev_replace.srcdev; 5867 else 5868 srcdev = NULL; 5869 5870 /* 5871 * try to avoid the drive that is the source drive for a 5872 * dev-replace procedure, only choose it if no other non-missing 5873 * mirror is available 5874 */ 5875 for (tolerance = 0; tolerance < 2; tolerance++) { 5876 if (map->stripes[preferred_mirror].dev->bdev && 5877 (tolerance || map->stripes[preferred_mirror].dev != srcdev)) 5878 return preferred_mirror; 5879 for (i = first; i < first + num_stripes; i++) { 5880 if (map->stripes[i].dev->bdev && 5881 (tolerance || map->stripes[i].dev != srcdev)) 5882 return i; 5883 } 5884 } 5885 5886 /* we couldn't find one that doesn't fail. 
Just return something 5887 * and the io error handling code will clean up eventually 5888 */ 5889 return preferred_mirror; 5890 } 5891 5892 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ 5893 static void sort_parity_stripes(struct btrfs_io_context *bioc, int num_stripes) 5894 { 5895 int i; 5896 int again = 1; 5897 5898 while (again) { 5899 again = 0; 5900 for (i = 0; i < num_stripes - 1; i++) { 5901 /* Swap if parity is on a smaller index */ 5902 if (bioc->raid_map[i] > bioc->raid_map[i + 1]) { 5903 swap(bioc->stripes[i], bioc->stripes[i + 1]); 5904 swap(bioc->raid_map[i], bioc->raid_map[i + 1]); 5905 again = 1; 5906 } 5907 } 5908 } 5909 } 5910 5911 static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, 5912 int total_stripes, 5913 int real_stripes) 5914 { 5915 struct btrfs_io_context *bioc = kzalloc( 5916 /* The size of btrfs_io_context */ 5917 sizeof(struct btrfs_io_context) + 5918 /* Plus the variable array for the stripes */ 5919 sizeof(struct btrfs_io_stripe) * (total_stripes) + 5920 /* Plus the variable array for the tgt dev */ 5921 sizeof(int) * (real_stripes) + 5922 /* 5923 * Plus the raid_map, which includes both the tgt dev 5924 * and the stripes. 5925 */ 5926 sizeof(u64) * (total_stripes), 5927 GFP_NOFS|__GFP_NOFAIL); 5928 5929 atomic_set(&bioc->error, 0); 5930 refcount_set(&bioc->refs, 1); 5931 5932 bioc->fs_info = fs_info; 5933 bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes); 5934 bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes); 5935 5936 return bioc; 5937 } 5938 5939 void btrfs_get_bioc(struct btrfs_io_context *bioc) 5940 { 5941 WARN_ON(!refcount_read(&bioc->refs)); 5942 refcount_inc(&bioc->refs); 5943 } 5944 5945 void btrfs_put_bioc(struct btrfs_io_context *bioc) 5946 { 5947 if (!bioc) 5948 return; 5949 if (refcount_dec_and_test(&bioc->refs)) 5950 kfree(bioc); 5951 } 5952 5953 /* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */ 5954 /* 5955 * Please note that, discard won't be sent to target device of device 5956 * replace. 5957 */ 5958 static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, 5959 u64 logical, u64 *length_ret, 5960 struct btrfs_io_context **bioc_ret) 5961 { 5962 struct extent_map *em; 5963 struct map_lookup *map; 5964 struct btrfs_io_context *bioc; 5965 u64 length = *length_ret; 5966 u64 offset; 5967 u64 stripe_nr; 5968 u64 stripe_nr_end; 5969 u64 stripe_end_offset; 5970 u64 stripe_cnt; 5971 u64 stripe_len; 5972 u64 stripe_offset; 5973 u64 num_stripes; 5974 u32 stripe_index; 5975 u32 factor = 0; 5976 u32 sub_stripes = 0; 5977 u64 stripes_per_dev = 0; 5978 u32 remaining_stripes = 0; 5979 u32 last_stripe = 0; 5980 int ret = 0; 5981 int i; 5982 5983 /* Discard always returns a bioc. 
*/ 5984 ASSERT(bioc_ret); 5985 5986 em = btrfs_get_chunk_map(fs_info, logical, length); 5987 if (IS_ERR(em)) 5988 return PTR_ERR(em); 5989 5990 map = em->map_lookup; 5991 /* we don't discard raid56 yet */ 5992 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5993 ret = -EOPNOTSUPP; 5994 goto out; 5995 } 5996 5997 offset = logical - em->start; 5998 length = min_t(u64, em->start + em->len - logical, length); 5999 *length_ret = length; 6000 6001 stripe_len = map->stripe_len; 6002 /* 6003 * stripe_nr counts the total number of stripes we have to stride 6004 * to get to this block 6005 */ 6006 stripe_nr = div64_u64(offset, stripe_len); 6007 6008 /* stripe_offset is the offset of this block in its stripe */ 6009 stripe_offset = offset - stripe_nr * stripe_len; 6010 6011 stripe_nr_end = round_up(offset + length, map->stripe_len); 6012 stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len); 6013 stripe_cnt = stripe_nr_end - stripe_nr; 6014 stripe_end_offset = stripe_nr_end * map->stripe_len - 6015 (offset + length); 6016 /* 6017 * after this, stripe_nr is the number of stripes on this 6018 * device we have to walk to find the data, and stripe_index is 6019 * the number of our device in the stripe array 6020 */ 6021 num_stripes = 1; 6022 stripe_index = 0; 6023 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 6024 BTRFS_BLOCK_GROUP_RAID10)) { 6025 if (map->type & BTRFS_BLOCK_GROUP_RAID0) 6026 sub_stripes = 1; 6027 else 6028 sub_stripes = map->sub_stripes; 6029 6030 factor = map->num_stripes / sub_stripes; 6031 num_stripes = min_t(u64, map->num_stripes, 6032 sub_stripes * stripe_cnt); 6033 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 6034 stripe_index *= sub_stripes; 6035 stripes_per_dev = div_u64_rem(stripe_cnt, factor, 6036 &remaining_stripes); 6037 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); 6038 last_stripe *= sub_stripes; 6039 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | 6040 BTRFS_BLOCK_GROUP_DUP)) { 6041 num_stripes = map->num_stripes; 6042 } else { 6043 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 6044 &stripe_index); 6045 } 6046 6047 bioc = alloc_btrfs_io_context(fs_info, num_stripes, 0); 6048 if (!bioc) { 6049 ret = -ENOMEM; 6050 goto out; 6051 } 6052 6053 for (i = 0; i < num_stripes; i++) { 6054 bioc->stripes[i].physical = 6055 map->stripes[stripe_index].physical + 6056 stripe_offset + stripe_nr * map->stripe_len; 6057 bioc->stripes[i].dev = map->stripes[stripe_index].dev; 6058 6059 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 6060 BTRFS_BLOCK_GROUP_RAID10)) { 6061 bioc->stripes[i].length = stripes_per_dev * 6062 map->stripe_len; 6063 6064 if (i / sub_stripes < remaining_stripes) 6065 bioc->stripes[i].length += map->stripe_len; 6066 6067 /* 6068 * Special for the first stripe and 6069 * the last stripe: 6070 * 6071 * |-------|...|-------| 6072 * |----------| 6073 * off end_off 6074 */ 6075 if (i < sub_stripes) 6076 bioc->stripes[i].length -= stripe_offset; 6077 6078 if (stripe_index >= last_stripe && 6079 stripe_index <= (last_stripe + 6080 sub_stripes - 1)) 6081 bioc->stripes[i].length -= stripe_end_offset; 6082 6083 if (i == sub_stripes - 1) 6084 stripe_offset = 0; 6085 } else { 6086 bioc->stripes[i].length = length; 6087 } 6088 6089 stripe_index++; 6090 if (stripe_index == map->num_stripes) { 6091 stripe_index = 0; 6092 stripe_nr++; 6093 } 6094 } 6095 6096 *bioc_ret = bioc; 6097 bioc->map_type = map->type; 6098 bioc->num_stripes = num_stripes; 6099 out: 6100 free_extent_map(em); 6101 return ret; 6102 } 6103 6104 /* 6105 * In dev-replace case, for 
repair case (that's the only case where the mirror 6106 * is selected explicitly when calling btrfs_map_block), blocks left of the 6107 * left cursor can also be read from the target drive. 6108 * 6109 * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the 6110 * array of stripes. 6111 * For READ, it also needs to be supported using the same mirror number. 6112 * 6113 * If the requested block is not left of the left cursor, EIO is returned. This 6114 * can happen because btrfs_num_copies() returns one more in the dev-replace 6115 * case. 6116 */ 6117 static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, 6118 u64 logical, u64 length, 6119 u64 srcdev_devid, int *mirror_num, 6120 u64 *physical) 6121 { 6122 struct btrfs_io_context *bioc = NULL; 6123 int num_stripes; 6124 int index_srcdev = 0; 6125 int found = 0; 6126 u64 physical_of_found = 0; 6127 int i; 6128 int ret = 0; 6129 6130 ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, 6131 logical, &length, &bioc, 0, 0); 6132 if (ret) { 6133 ASSERT(bioc == NULL); 6134 return ret; 6135 } 6136 6137 num_stripes = bioc->num_stripes; 6138 if (*mirror_num > num_stripes) { 6139 /* 6140 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror, 6141 * that means that the requested area is not left of the left 6142 * cursor 6143 */ 6144 btrfs_put_bioc(bioc); 6145 return -EIO; 6146 } 6147 6148 /* 6149 * Process the rest of the function using the mirror_num of the source 6150 * drive. Therefore look it up first. At the end, patch the device 6151 * pointer to the one of the target drive. 6152 */ 6153 for (i = 0; i < num_stripes; i++) { 6154 if (bioc->stripes[i].dev->devid != srcdev_devid) 6155 continue; 6156 6157 /* 6158 * In case of DUP, in order to keep it simple, only add the 6159 * mirror with the lowest physical address 6160 */ 6161 if (found && 6162 physical_of_found <= bioc->stripes[i].physical) 6163 continue; 6164 6165 index_srcdev = i; 6166 found = 1; 6167 physical_of_found = bioc->stripes[i].physical; 6168 } 6169 6170 btrfs_put_bioc(bioc); 6171 6172 ASSERT(found); 6173 if (!found) 6174 return -EIO; 6175 6176 *mirror_num = index_srcdev + 1; 6177 *physical = physical_of_found; 6178 return ret; 6179 } 6180 6181 static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) 6182 { 6183 struct btrfs_block_group *cache; 6184 bool ret; 6185 6186 /* A non-zoned filesystem does not use the "to_copy" flag */ 6187 if (!btrfs_is_zoned(fs_info)) 6188 return false; 6189 6190 cache = btrfs_lookup_block_group(fs_info, logical); 6191 6192 spin_lock(&cache->lock); 6193 ret = cache->to_copy; 6194 spin_unlock(&cache->lock); 6195 6196 btrfs_put_block_group(cache); 6197 return ret; 6198 } 6199 6200 static void handle_ops_on_dev_replace(enum btrfs_map_op op, 6201 struct btrfs_io_context **bioc_ret, 6202 struct btrfs_dev_replace *dev_replace, 6203 u64 logical, 6204 int *num_stripes_ret, int *max_errors_ret) 6205 { 6206 struct btrfs_io_context *bioc = *bioc_ret; 6207 u64 srcdev_devid = dev_replace->srcdev->devid; 6208 int tgtdev_indexes = 0; 6209 int num_stripes = *num_stripes_ret; 6210 int max_errors = *max_errors_ret; 6211 int i; 6212 6213 if (op == BTRFS_MAP_WRITE) { 6214 int index_where_to_add; 6215 6216 /* 6217 * A block group which has "to_copy" set will eventually 6218 * be copied by the dev-replace process. We can avoid cloning IO here.
6219 */ 6220 if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical)) 6221 return; 6222 6223 /* 6224 * duplicate the write operations while the dev replace 6225 * procedure is running. Since the copying of the old disk to 6226 * the new disk takes place at run time while the filesystem is 6227 * mounted writable, the regular write operations to the old 6228 * disk have to be duplicated to go to the new disk as well. 6229 * 6230 * Note that device->missing is handled by the caller, and that 6231 * the write to the old disk is already set up in the stripes 6232 * array. 6233 */ 6234 index_where_to_add = num_stripes; 6235 for (i = 0; i < num_stripes; i++) { 6236 if (bioc->stripes[i].dev->devid == srcdev_devid) { 6237 /* write to new disk, too */ 6238 struct btrfs_io_stripe *new = 6239 bioc->stripes + index_where_to_add; 6240 struct btrfs_io_stripe *old = 6241 bioc->stripes + i; 6242 6243 new->physical = old->physical; 6244 new->length = old->length; 6245 new->dev = dev_replace->tgtdev; 6246 bioc->tgtdev_map[i] = index_where_to_add; 6247 index_where_to_add++; 6248 max_errors++; 6249 tgtdev_indexes++; 6250 } 6251 } 6252 num_stripes = index_where_to_add; 6253 } else if (op == BTRFS_MAP_GET_READ_MIRRORS) { 6254 int index_srcdev = 0; 6255 int found = 0; 6256 u64 physical_of_found = 0; 6257 6258 /* 6259 * During the dev-replace procedure, the target drive can also 6260 * be used to read data in case it is needed to repair a corrupt 6261 * block elsewhere. This is possible if the requested area is 6262 * left of the left cursor. In this area, the target drive is a 6263 * full copy of the source drive. 6264 */ 6265 for (i = 0; i < num_stripes; i++) { 6266 if (bioc->stripes[i].dev->devid == srcdev_devid) { 6267 /* 6268 * In case of DUP, in order to keep it simple, 6269 * only add the mirror with the lowest physical 6270 * address 6271 */ 6272 if (found && 6273 physical_of_found <= bioc->stripes[i].physical) 6274 continue; 6275 index_srcdev = i; 6276 found = 1; 6277 physical_of_found = bioc->stripes[i].physical; 6278 } 6279 } 6280 if (found) { 6281 struct btrfs_io_stripe *tgtdev_stripe = 6282 bioc->stripes + num_stripes; 6283 6284 tgtdev_stripe->physical = physical_of_found; 6285 tgtdev_stripe->length = 6286 bioc->stripes[index_srcdev].length; 6287 tgtdev_stripe->dev = dev_replace->tgtdev; 6288 bioc->tgtdev_map[index_srcdev] = num_stripes; 6289 6290 tgtdev_indexes++; 6291 num_stripes++; 6292 } 6293 } 6294 6295 *num_stripes_ret = num_stripes; 6296 *max_errors_ret = max_errors; 6297 bioc->num_tgtdevs = tgtdev_indexes; 6298 *bioc_ret = bioc; 6299 } 6300 6301 static bool need_full_stripe(enum btrfs_map_op op) 6302 { 6303 return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); 6304 } 6305 6306 /* 6307 * Calculate the geometry of a particular (address, len) tuple. This 6308 * information is used to calculate how big a particular bio can get before it 6309 * straddles a stripe. 6310 * 6311 * @fs_info: the filesystem 6312 * @em: mapping containing the logical extent 6313 * @op: type of operation - write or read 6314 * @logical: address that we want to figure out the geometry of 6315 * @io_geom: pointer used to return values 6316 * 6317 * Returns < 0 in case a chunk for the given logical address cannot be found, 6318 * usually shouldn't happen unless @logical is corrupted, 0 otherwise. 
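 *
 * Worked example (illustrative only): for a RAID0 chunk with a 64K
 * stripe_len, a @logical that is 80K past em->start yields stripe_nr 1,
 * stripe_offset 16K and a len capped at 48K so the I/O does not cross
 * the stripe boundary.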
6319 */ 6320 int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, 6321 enum btrfs_map_op op, u64 logical, 6322 struct btrfs_io_geometry *io_geom) 6323 { 6324 struct map_lookup *map; 6325 u64 len; 6326 u64 offset; 6327 u64 stripe_offset; 6328 u64 stripe_nr; 6329 u64 stripe_len; 6330 u64 raid56_full_stripe_start = (u64)-1; 6331 int data_stripes; 6332 6333 ASSERT(op != BTRFS_MAP_DISCARD); 6334 6335 map = em->map_lookup; 6336 /* Offset of this logical address in the chunk */ 6337 offset = logical - em->start; 6338 /* Len of a stripe in a chunk */ 6339 stripe_len = map->stripe_len; 6340 /* Stripe where this block falls in */ 6341 stripe_nr = div64_u64(offset, stripe_len); 6342 /* Offset of stripe in the chunk */ 6343 stripe_offset = stripe_nr * stripe_len; 6344 if (offset < stripe_offset) { 6345 btrfs_crit(fs_info, 6346 "stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu", 6347 stripe_offset, offset, em->start, logical, stripe_len); 6348 return -EINVAL; 6349 } 6350 6351 /* stripe_offset is the offset of this block in its stripe */ 6352 stripe_offset = offset - stripe_offset; 6353 data_stripes = nr_data_stripes(map); 6354 6355 /* Only stripe based profiles needs to check against stripe length. */ 6356 if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) { 6357 u64 max_len = stripe_len - stripe_offset; 6358 6359 /* 6360 * In case of raid56, we need to know the stripe aligned start 6361 */ 6362 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 6363 unsigned long full_stripe_len = stripe_len * data_stripes; 6364 raid56_full_stripe_start = offset; 6365 6366 /* 6367 * Allow a write of a full stripe, but make sure we 6368 * don't allow straddling of stripes 6369 */ 6370 raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, 6371 full_stripe_len); 6372 raid56_full_stripe_start *= full_stripe_len; 6373 6374 /* 6375 * For writes to RAID[56], allow a full stripeset across 6376 * all disks. For other RAID types and for RAID[56] 6377 * reads, just allow a single stripe (on a single disk). 
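 *
 * As an illustration (not from the original source): with 3 data
 * stripes and a 64K stripe_len the full stripe covers 192K, so a
 * write starting 16K into it may extend for up to 176K before it
 * would straddle the next full stripe.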
6378 */ 6379 if (op == BTRFS_MAP_WRITE) { 6380 max_len = stripe_len * data_stripes - 6381 (offset - raid56_full_stripe_start); 6382 } 6383 } 6384 len = min_t(u64, em->len - offset, max_len); 6385 } else { 6386 len = em->len - offset; 6387 } 6388 6389 io_geom->len = len; 6390 io_geom->offset = offset; 6391 io_geom->stripe_len = stripe_len; 6392 io_geom->stripe_nr = stripe_nr; 6393 io_geom->stripe_offset = stripe_offset; 6394 io_geom->raid56_stripe_offset = raid56_full_stripe_start; 6395 6396 return 0; 6397 } 6398 6399 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, 6400 enum btrfs_map_op op, 6401 u64 logical, u64 *length, 6402 struct btrfs_io_context **bioc_ret, 6403 int mirror_num, int need_raid_map) 6404 { 6405 struct extent_map *em; 6406 struct map_lookup *map; 6407 u64 stripe_offset; 6408 u64 stripe_nr; 6409 u64 stripe_len; 6410 u32 stripe_index; 6411 int data_stripes; 6412 int i; 6413 int ret = 0; 6414 int num_stripes; 6415 int max_errors = 0; 6416 int tgtdev_indexes = 0; 6417 struct btrfs_io_context *bioc = NULL; 6418 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 6419 int dev_replace_is_ongoing = 0; 6420 int num_alloc_stripes; 6421 int patch_the_first_stripe_for_dev_replace = 0; 6422 u64 physical_to_patch_in_first_stripe = 0; 6423 u64 raid56_full_stripe_start = (u64)-1; 6424 struct btrfs_io_geometry geom; 6425 6426 ASSERT(bioc_ret); 6427 ASSERT(op != BTRFS_MAP_DISCARD); 6428 6429 em = btrfs_get_chunk_map(fs_info, logical, *length); 6430 ASSERT(!IS_ERR(em)); 6431 6432 ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom); 6433 if (ret < 0) 6434 return ret; 6435 6436 map = em->map_lookup; 6437 6438 *length = geom.len; 6439 stripe_len = geom.stripe_len; 6440 stripe_nr = geom.stripe_nr; 6441 stripe_offset = geom.stripe_offset; 6442 raid56_full_stripe_start = geom.raid56_stripe_offset; 6443 data_stripes = nr_data_stripes(map); 6444 6445 down_read(&dev_replace->rwsem); 6446 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 6447 /* 6448 * Hold the semaphore for read during the whole operation, write is 6449 * requested at commit time but must wait. 
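 *
 * If no replace is running the read lock is dropped right below;
 * otherwise it is held until the out label at the end of this
 * function.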
6450 */ 6451 if (!dev_replace_is_ongoing) 6452 up_read(&dev_replace->rwsem); 6453 6454 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && 6455 !need_full_stripe(op) && dev_replace->tgtdev != NULL) { 6456 ret = get_extra_mirror_from_replace(fs_info, logical, *length, 6457 dev_replace->srcdev->devid, 6458 &mirror_num, 6459 &physical_to_patch_in_first_stripe); 6460 if (ret) 6461 goto out; 6462 else 6463 patch_the_first_stripe_for_dev_replace = 1; 6464 } else if (mirror_num > map->num_stripes) { 6465 mirror_num = 0; 6466 } 6467 6468 num_stripes = 1; 6469 stripe_index = 0; 6470 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 6471 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 6472 &stripe_index); 6473 if (!need_full_stripe(op)) 6474 mirror_num = 1; 6475 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { 6476 if (need_full_stripe(op)) 6477 num_stripes = map->num_stripes; 6478 else if (mirror_num) 6479 stripe_index = mirror_num - 1; 6480 else { 6481 stripe_index = find_live_mirror(fs_info, map, 0, 6482 dev_replace_is_ongoing); 6483 mirror_num = stripe_index + 1; 6484 } 6485 6486 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 6487 if (need_full_stripe(op)) { 6488 num_stripes = map->num_stripes; 6489 } else if (mirror_num) { 6490 stripe_index = mirror_num - 1; 6491 } else { 6492 mirror_num = 1; 6493 } 6494 6495 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 6496 u32 factor = map->num_stripes / map->sub_stripes; 6497 6498 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 6499 stripe_index *= map->sub_stripes; 6500 6501 if (need_full_stripe(op)) 6502 num_stripes = map->sub_stripes; 6503 else if (mirror_num) 6504 stripe_index += mirror_num - 1; 6505 else { 6506 int old_stripe_index = stripe_index; 6507 stripe_index = find_live_mirror(fs_info, map, 6508 stripe_index, 6509 dev_replace_is_ongoing); 6510 mirror_num = stripe_index - old_stripe_index + 1; 6511 } 6512 6513 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 6514 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { 6515 /* push stripe_nr back to the start of the full stripe */ 6516 stripe_nr = div64_u64(raid56_full_stripe_start, 6517 stripe_len * data_stripes); 6518 6519 /* RAID[56] write or recovery. Return all stripes */ 6520 num_stripes = map->num_stripes; 6521 max_errors = nr_parity_stripes(map); 6522 6523 *length = map->stripe_len; 6524 stripe_index = 0; 6525 stripe_offset = 0; 6526 } else { 6527 /* 6528 * Mirror #0 or #1 means the original data block. 6529 * Mirror #2 is RAID5 parity block. 6530 * Mirror #3 is RAID6 Q block. 
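 *
 * For instance (illustrative): on a 4-device RAID6 chunk
 * (data_stripes == 2), mirror_num 3 selects stripe_index
 * data_stripes + 3 - 2 == 3, i.e. the Q stripe, before the
 * per-stripe rotation below is applied.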
6531 */ 6532 stripe_nr = div_u64_rem(stripe_nr, 6533 data_stripes, &stripe_index); 6534 if (mirror_num > 1) 6535 stripe_index = data_stripes + mirror_num - 2; 6536 6537 /* We distribute the parity blocks across stripes */ 6538 div_u64_rem(stripe_nr + stripe_index, map->num_stripes, 6539 &stripe_index); 6540 if (!need_full_stripe(op) && mirror_num <= 1) 6541 mirror_num = 1; 6542 } 6543 } else { 6544 /* 6545 * after this, stripe_nr is the number of stripes on this 6546 * device we have to walk to find the data, and stripe_index is 6547 * the number of our device in the stripe array 6548 */ 6549 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 6550 &stripe_index); 6551 mirror_num = stripe_index + 1; 6552 } 6553 if (stripe_index >= map->num_stripes) { 6554 btrfs_crit(fs_info, 6555 "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u", 6556 stripe_index, map->num_stripes); 6557 ret = -EINVAL; 6558 goto out; 6559 } 6560 6561 num_alloc_stripes = num_stripes; 6562 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) { 6563 if (op == BTRFS_MAP_WRITE) 6564 num_alloc_stripes <<= 1; 6565 if (op == BTRFS_MAP_GET_READ_MIRRORS) 6566 num_alloc_stripes++; 6567 tgtdev_indexes = num_stripes; 6568 } 6569 6570 bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes); 6571 if (!bioc) { 6572 ret = -ENOMEM; 6573 goto out; 6574 } 6575 6576 for (i = 0; i < num_stripes; i++) { 6577 bioc->stripes[i].physical = map->stripes[stripe_index].physical + 6578 stripe_offset + stripe_nr * map->stripe_len; 6579 bioc->stripes[i].dev = map->stripes[stripe_index].dev; 6580 stripe_index++; 6581 } 6582 6583 /* Build raid_map */ 6584 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && 6585 (need_full_stripe(op) || mirror_num > 1)) { 6586 u64 tmp; 6587 unsigned rot; 6588 6589 /* Work out the disk rotation on this stripe-set */ 6590 div_u64_rem(stripe_nr, num_stripes, &rot); 6591 6592 /* Fill in the logical address of each stripe */ 6593 tmp = stripe_nr * data_stripes; 6594 for (i = 0; i < data_stripes; i++) 6595 bioc->raid_map[(i + rot) % num_stripes] = 6596 em->start + (tmp + i) * map->stripe_len; 6597 6598 bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE; 6599 if (map->type & BTRFS_BLOCK_GROUP_RAID6) 6600 bioc->raid_map[(i + rot + 1) % num_stripes] = 6601 RAID6_Q_STRIPE; 6602 6603 sort_parity_stripes(bioc, num_stripes); 6604 } 6605 6606 if (need_full_stripe(op)) 6607 max_errors = btrfs_chunk_max_errors(map); 6608 6609 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && 6610 need_full_stripe(op)) { 6611 handle_ops_on_dev_replace(op, &bioc, dev_replace, logical, 6612 &num_stripes, &max_errors); 6613 } 6614 6615 *bioc_ret = bioc; 6616 bioc->map_type = map->type; 6617 bioc->num_stripes = num_stripes; 6618 bioc->max_errors = max_errors; 6619 bioc->mirror_num = mirror_num; 6620 6621 /* 6622 * this is the case that REQ_READ && dev_replace_is_ongoing && 6623 * mirror_num == num_stripes + 1 && dev_replace target drive is 6624 * available as a mirror 6625 */ 6626 if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { 6627 WARN_ON(num_stripes > 1); 6628 bioc->stripes[0].dev = dev_replace->tgtdev; 6629 bioc->stripes[0].physical = physical_to_patch_in_first_stripe; 6630 bioc->mirror_num = map->num_stripes + 1; 6631 } 6632 out: 6633 if (dev_replace_is_ongoing) { 6634 lockdep_assert_held(&dev_replace->rwsem); 6635 /* Unlock and let waiting writers proceed */ 6636 up_read(&dev_replace->rwsem); 6637 } 6638 free_extent_map(em); 6639 return ret; 6640 } 
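/*
 * Illustrative sketch, not part of the original source: a typical caller
 * maps a logical range and then walks the returned stripes, along the
 * lines of what btrfs_map_bio() further below does:
 *
 *	u64 map_length = length;
 *	struct btrfs_io_context *bioc = NULL;
 *	int ret, i;
 *
 *	ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
 *			      &map_length, &bioc, 0);
 *	if (ret)
 *		return errno_to_blk_status(ret);
 *	for (i = 0; i < bioc->num_stripes; i++)
 *		... submit to bioc->stripes[i].dev at bioc->stripes[i].physical ...
 *
 * On success *length may be shrunk so the I/O does not cross a stripe
 * boundary, and the caller is responsible for dropping its reference to
 * the io context with btrfs_put_bioc().
 */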
6641 6642 int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 6643 u64 logical, u64 *length, 6644 struct btrfs_io_context **bioc_ret, int mirror_num) 6645 { 6646 if (op == BTRFS_MAP_DISCARD) 6647 return __btrfs_map_block_for_discard(fs_info, logical, 6648 length, bioc_ret); 6649 6650 return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 6651 mirror_num, 0); 6652 } 6653 6654 /* For Scrub/replace */ 6655 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 6656 u64 logical, u64 *length, 6657 struct btrfs_io_context **bioc_ret) 6658 { 6659 return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1); 6660 } 6661 6662 static inline void btrfs_end_bioc(struct btrfs_io_context *bioc, struct bio *bio) 6663 { 6664 bio->bi_private = bioc->private; 6665 bio->bi_end_io = bioc->end_io; 6666 bio_endio(bio); 6667 6668 btrfs_put_bioc(bioc); 6669 } 6670 6671 static void btrfs_end_bio(struct bio *bio) 6672 { 6673 struct btrfs_io_context *bioc = bio->bi_private; 6674 int is_orig_bio = 0; 6675 6676 if (bio->bi_status) { 6677 atomic_inc(&bioc->error); 6678 if (bio->bi_status == BLK_STS_IOERR || 6679 bio->bi_status == BLK_STS_TARGET) { 6680 struct btrfs_device *dev = btrfs_bio(bio)->device; 6681 6682 ASSERT(dev->bdev); 6683 if (btrfs_op(bio) == BTRFS_MAP_WRITE) 6684 btrfs_dev_stat_inc_and_print(dev, 6685 BTRFS_DEV_STAT_WRITE_ERRS); 6686 else if (!(bio->bi_opf & REQ_RAHEAD)) 6687 btrfs_dev_stat_inc_and_print(dev, 6688 BTRFS_DEV_STAT_READ_ERRS); 6689 if (bio->bi_opf & REQ_PREFLUSH) 6690 btrfs_dev_stat_inc_and_print(dev, 6691 BTRFS_DEV_STAT_FLUSH_ERRS); 6692 } 6693 } 6694 6695 if (bio == bioc->orig_bio) 6696 is_orig_bio = 1; 6697 6698 btrfs_bio_counter_dec(bioc->fs_info); 6699 6700 if (atomic_dec_and_test(&bioc->stripes_pending)) { 6701 if (!is_orig_bio) { 6702 bio_put(bio); 6703 bio = bioc->orig_bio; 6704 } 6705 6706 btrfs_bio(bio)->mirror_num = bioc->mirror_num; 6707 /* only send an error to the higher layers if it is 6708 * beyond the tolerance of the btrfs bio 6709 */ 6710 if (atomic_read(&bioc->error) > bioc->max_errors) { 6711 bio->bi_status = BLK_STS_IOERR; 6712 } else { 6713 /* 6714 * this bio is actually up to date, we didn't 6715 * go over the max number of errors 6716 */ 6717 bio->bi_status = BLK_STS_OK; 6718 } 6719 6720 btrfs_end_bioc(bioc, bio); 6721 } else if (!is_orig_bio) { 6722 bio_put(bio); 6723 } 6724 } 6725 6726 static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio, 6727 u64 physical, struct btrfs_device *dev) 6728 { 6729 struct btrfs_fs_info *fs_info = bioc->fs_info; 6730 6731 bio->bi_private = bioc; 6732 btrfs_bio(bio)->device = dev; 6733 bio->bi_end_io = btrfs_end_bio; 6734 bio->bi_iter.bi_sector = physical >> 9; 6735 /* 6736 * For zone append writing, bi_sector must point the beginning of the 6737 * zone 6738 */ 6739 if (bio_op(bio) == REQ_OP_ZONE_APPEND) { 6740 if (btrfs_dev_is_sequential(dev, physical)) { 6741 u64 zone_start = round_down(physical, fs_info->zone_size); 6742 6743 bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; 6744 } else { 6745 bio->bi_opf &= ~REQ_OP_ZONE_APPEND; 6746 bio->bi_opf |= REQ_OP_WRITE; 6747 } 6748 } 6749 btrfs_debug_in_rcu(fs_info, 6750 "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", 6751 bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, 6752 (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name), 6753 dev->devid, bio->bi_iter.bi_size); 6754 bio_set_dev(bio, dev->bdev); 6755 6756 btrfs_bio_counter_inc_noblocked(fs_info); 6757 6758 
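	/*
	 * Hand the bio to the block layer; btrfsic_submit_bio() is the
	 * check-integrity wrapper and behaves like plain submit_bio()
	 * when that debugging feature is disabled.
	 */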
btrfsic_submit_bio(bio); 6759 } 6760 6761 static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical) 6762 { 6763 atomic_inc(&bioc->error); 6764 if (atomic_dec_and_test(&bioc->stripes_pending)) { 6765 /* Should be the original bio. */ 6766 WARN_ON(bio != bioc->orig_bio); 6767 6768 btrfs_bio(bio)->mirror_num = bioc->mirror_num; 6769 bio->bi_iter.bi_sector = logical >> 9; 6770 if (atomic_read(&bioc->error) > bioc->max_errors) 6771 bio->bi_status = BLK_STS_IOERR; 6772 else 6773 bio->bi_status = BLK_STS_OK; 6774 btrfs_end_bioc(bioc, bio); 6775 } 6776 } 6777 6778 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, 6779 int mirror_num) 6780 { 6781 struct btrfs_device *dev; 6782 struct bio *first_bio = bio; 6783 u64 logical = bio->bi_iter.bi_sector << 9; 6784 u64 length = 0; 6785 u64 map_length; 6786 int ret; 6787 int dev_nr; 6788 int total_devs; 6789 struct btrfs_io_context *bioc = NULL; 6790 6791 length = bio->bi_iter.bi_size; 6792 map_length = length; 6793 6794 btrfs_bio_counter_inc_blocked(fs_info); 6795 ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, 6796 &map_length, &bioc, mirror_num, 1); 6797 if (ret) { 6798 btrfs_bio_counter_dec(fs_info); 6799 return errno_to_blk_status(ret); 6800 } 6801 6802 total_devs = bioc->num_stripes; 6803 bioc->orig_bio = first_bio; 6804 bioc->private = first_bio->bi_private; 6805 bioc->end_io = first_bio->bi_end_io; 6806 atomic_set(&bioc->stripes_pending, bioc->num_stripes); 6807 6808 if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && 6809 ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) { 6810 /* In this case, map_length has been set to the length of 6811 a single stripe; not the whole write */ 6812 if (btrfs_op(bio) == BTRFS_MAP_WRITE) { 6813 ret = raid56_parity_write(bio, bioc, map_length); 6814 } else { 6815 ret = raid56_parity_recover(bio, bioc, map_length, 6816 mirror_num, 1); 6817 } 6818 6819 btrfs_bio_counter_dec(fs_info); 6820 return errno_to_blk_status(ret); 6821 } 6822 6823 if (map_length < length) { 6824 btrfs_crit(fs_info, 6825 "mapping failed logical %llu bio len %llu len %llu", 6826 logical, length, map_length); 6827 BUG(); 6828 } 6829 6830 for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { 6831 dev = bioc->stripes[dev_nr].dev; 6832 if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, 6833 &dev->dev_state) || 6834 (btrfs_op(first_bio) == BTRFS_MAP_WRITE && 6835 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { 6836 bioc_error(bioc, first_bio, logical); 6837 continue; 6838 } 6839 6840 if (dev_nr < total_devs - 1) 6841 bio = btrfs_bio_clone(first_bio); 6842 else 6843 bio = first_bio; 6844 6845 submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev); 6846 } 6847 btrfs_bio_counter_dec(fs_info); 6848 return BLK_STS_OK; 6849 } 6850 6851 static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args, 6852 const struct btrfs_fs_devices *fs_devices) 6853 { 6854 if (args->fsid == NULL) 6855 return true; 6856 if (memcmp(fs_devices->metadata_uuid, args->fsid, BTRFS_FSID_SIZE) == 0) 6857 return true; 6858 return false; 6859 } 6860 6861 static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args, 6862 const struct btrfs_device *device) 6863 { 6864 ASSERT((args->devid != (u64)-1) || args->missing); 6865 6866 if ((args->devid != (u64)-1) && device->devid != args->devid) 6867 return false; 6868 if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0) 6869 return false; 6870 if (!args->missing) 6871 return true; 6872 if 
(test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) && 6873 !device->bdev) 6874 return true; 6875 return false; 6876 } 6877 6878 /* 6879 * Find a device specified by @devid or @uuid in the list of @fs_devices, or 6880 * return NULL. 6881 * 6882 * If devid and uuid are both specified, the match must be exact, otherwise 6883 * only devid is used. 6884 */ 6885 struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices, 6886 const struct btrfs_dev_lookup_args *args) 6887 { 6888 struct btrfs_device *device; 6889 struct btrfs_fs_devices *seed_devs; 6890 6891 if (dev_args_match_fs_devices(args, fs_devices)) { 6892 list_for_each_entry(device, &fs_devices->devices, dev_list) { 6893 if (dev_args_match_device(args, device)) 6894 return device; 6895 } 6896 } 6897 6898 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { 6899 if (!dev_args_match_fs_devices(args, seed_devs)) 6900 continue; 6901 list_for_each_entry(device, &seed_devs->devices, dev_list) { 6902 if (dev_args_match_device(args, device)) 6903 return device; 6904 } 6905 } 6906 6907 return NULL; 6908 } 6909 6910 static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, 6911 u64 devid, u8 *dev_uuid) 6912 { 6913 struct btrfs_device *device; 6914 unsigned int nofs_flag; 6915 6916 /* 6917 * We call this under the chunk_mutex, so we want to use NOFS for this 6918 * allocation, however we don't want to change btrfs_alloc_device() to 6919 * always do NOFS because we use it in a lot of other GFP_KERNEL safe 6920 * places. 6921 */ 6922 nofs_flag = memalloc_nofs_save(); 6923 device = btrfs_alloc_device(NULL, &devid, dev_uuid); 6924 memalloc_nofs_restore(nofs_flag); 6925 if (IS_ERR(device)) 6926 return device; 6927 6928 list_add(&device->dev_list, &fs_devices->devices); 6929 device->fs_devices = fs_devices; 6930 fs_devices->num_devices++; 6931 6932 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 6933 fs_devices->missing_devices++; 6934 6935 return device; 6936 } 6937 6938 /** 6939 * btrfs_alloc_device - allocate struct btrfs_device 6940 * @fs_info: used only for generating a new devid, can be NULL if 6941 * devid is provided (i.e. @devid != NULL). 6942 * @devid: a pointer to devid for this device. If NULL a new devid 6943 * is generated. 6944 * @uuid: a pointer to UUID for this device. If NULL a new UUID 6945 * is generated. 6946 * 6947 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() 6948 * on error. Returned struct is not linked onto any lists and must be 6949 * destroyed with btrfs_free_device. 
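 *
 * For example, add_missing_dev() above passes a NULL @fs_info together
 * with the devid and uuid read from disk, so no new devid needs to be
 * generated.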
6950 */ 6951 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, 6952 const u64 *devid, 6953 const u8 *uuid) 6954 { 6955 struct btrfs_device *dev; 6956 u64 tmp; 6957 6958 if (WARN_ON(!devid && !fs_info)) 6959 return ERR_PTR(-EINVAL); 6960 6961 dev = kzalloc(sizeof(*dev), GFP_KERNEL); 6962 if (!dev) 6963 return ERR_PTR(-ENOMEM); 6964 6965 /* 6966 * Preallocate a bio that's always going to be used for flushing device 6967 * barriers and matches the device lifespan 6968 */ 6969 dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0); 6970 if (!dev->flush_bio) { 6971 kfree(dev); 6972 return ERR_PTR(-ENOMEM); 6973 } 6974 6975 INIT_LIST_HEAD(&dev->dev_list); 6976 INIT_LIST_HEAD(&dev->dev_alloc_list); 6977 INIT_LIST_HEAD(&dev->post_commit_list); 6978 6979 atomic_set(&dev->dev_stats_ccnt, 0); 6980 btrfs_device_data_ordered_init(dev); 6981 extent_io_tree_init(fs_info, &dev->alloc_state, 6982 IO_TREE_DEVICE_ALLOC_STATE, NULL); 6983 6984 if (devid) 6985 tmp = *devid; 6986 else { 6987 int ret; 6988 6989 ret = find_next_devid(fs_info, &tmp); 6990 if (ret) { 6991 btrfs_free_device(dev); 6992 return ERR_PTR(ret); 6993 } 6994 } 6995 dev->devid = tmp; 6996 6997 if (uuid) 6998 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE); 6999 else 7000 generate_random_uuid(dev->uuid); 7001 7002 return dev; 7003 } 7004 7005 static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, 7006 u64 devid, u8 *uuid, bool error) 7007 { 7008 if (error) 7009 btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing", 7010 devid, uuid); 7011 else 7012 btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", 7013 devid, uuid); 7014 } 7015 7016 static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes) 7017 { 7018 const int data_stripes = calc_data_stripes(type, num_stripes); 7019 7020 return div_u64(chunk_len, data_stripes); 7021 } 7022 7023 #if BITS_PER_LONG == 32 7024 /* 7025 * Due to the page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE 7026 * can't be accessed on 32bit systems. 7027 * 7028 * This function does a mount-time check to reject the fs if it already has 7029 * a metadata chunk beyond that limit. 7030 */ 7031 static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info, 7032 u64 logical, u64 length, u64 type) 7033 { 7034 if (!(type & BTRFS_BLOCK_GROUP_METADATA)) 7035 return 0; 7036 7037 if (logical + length < MAX_LFS_FILESIZE) 7038 return 0; 7039 7040 btrfs_err_32bit_limit(fs_info); 7041 return -EOVERFLOW; 7042 } 7043 7044 /* 7045 * This is to give an early warning for any metadata chunk reaching 7046 * BTRFS_32BIT_EARLY_WARN_THRESHOLD. 7047 * Although we can still access the metadata, it's not going to be possible 7048 * once the limit is reached.
7049 */ 7050 static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info, 7051 u64 logical, u64 length, u64 type) 7052 { 7053 if (!(type & BTRFS_BLOCK_GROUP_METADATA)) 7054 return; 7055 7056 if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD) 7057 return; 7058 7059 btrfs_warn_32bit_limit(fs_info); 7060 } 7061 #endif 7062 7063 static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, 7064 struct btrfs_chunk *chunk) 7065 { 7066 BTRFS_DEV_LOOKUP_ARGS(args); 7067 struct btrfs_fs_info *fs_info = leaf->fs_info; 7068 struct extent_map_tree *map_tree = &fs_info->mapping_tree; 7069 struct map_lookup *map; 7070 struct extent_map *em; 7071 u64 logical; 7072 u64 length; 7073 u64 devid; 7074 u64 type; 7075 u8 uuid[BTRFS_UUID_SIZE]; 7076 int num_stripes; 7077 int ret; 7078 int i; 7079 7080 logical = key->offset; 7081 length = btrfs_chunk_length(leaf, chunk); 7082 type = btrfs_chunk_type(leaf, chunk); 7083 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 7084 7085 #if BITS_PER_LONG == 32 7086 ret = check_32bit_meta_chunk(fs_info, logical, length, type); 7087 if (ret < 0) 7088 return ret; 7089 warn_32bit_meta_chunk(fs_info, logical, length, type); 7090 #endif 7091 7092 /* 7093 * Only need to verify chunk item if we're reading from sys chunk array, 7094 * as chunk item in tree block is already verified by tree-checker. 7095 */ 7096 if (leaf->start == BTRFS_SUPER_INFO_OFFSET) { 7097 ret = btrfs_check_chunk_valid(leaf, chunk, logical); 7098 if (ret) 7099 return ret; 7100 } 7101 7102 read_lock(&map_tree->lock); 7103 em = lookup_extent_mapping(map_tree, logical, 1); 7104 read_unlock(&map_tree->lock); 7105 7106 /* already mapped? */ 7107 if (em && em->start <= logical && em->start + em->len > logical) { 7108 free_extent_map(em); 7109 return 0; 7110 } else if (em) { 7111 free_extent_map(em); 7112 } 7113 7114 em = alloc_extent_map(); 7115 if (!em) 7116 return -ENOMEM; 7117 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 7118 if (!map) { 7119 free_extent_map(em); 7120 return -ENOMEM; 7121 } 7122 7123 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 7124 em->map_lookup = map; 7125 em->start = logical; 7126 em->len = length; 7127 em->orig_start = 0; 7128 em->block_start = 0; 7129 em->block_len = em->len; 7130 7131 map->num_stripes = num_stripes; 7132 map->io_width = btrfs_chunk_io_width(leaf, chunk); 7133 map->io_align = btrfs_chunk_io_align(leaf, chunk); 7134 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 7135 map->type = type; 7136 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 7137 map->verified_stripes = 0; 7138 em->orig_block_len = calc_stripe_length(type, em->len, 7139 map->num_stripes); 7140 for (i = 0; i < num_stripes; i++) { 7141 map->stripes[i].physical = 7142 btrfs_stripe_offset_nr(leaf, chunk, i); 7143 devid = btrfs_stripe_devid_nr(leaf, chunk, i); 7144 args.devid = devid; 7145 read_extent_buffer(leaf, uuid, (unsigned long) 7146 btrfs_stripe_dev_uuid_nr(chunk, i), 7147 BTRFS_UUID_SIZE); 7148 args.uuid = uuid; 7149 map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args); 7150 if (!map->stripes[i].dev && 7151 !btrfs_test_opt(fs_info, DEGRADED)) { 7152 free_extent_map(em); 7153 btrfs_report_missing_device(fs_info, devid, uuid, true); 7154 return -ENOENT; 7155 } 7156 if (!map->stripes[i].dev) { 7157 map->stripes[i].dev = 7158 add_missing_dev(fs_info->fs_devices, devid, 7159 uuid); 7160 if (IS_ERR(map->stripes[i].dev)) { 7161 free_extent_map(em); 7162 btrfs_err(fs_info, 7163 "failed to init missing dev %llu: %ld", 7164 devid, 
PTR_ERR(map->stripes[i].dev)); 7165 return PTR_ERR(map->stripes[i].dev); 7166 } 7167 btrfs_report_missing_device(fs_info, devid, uuid, false); 7168 } 7169 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 7170 &(map->stripes[i].dev->dev_state)); 7171 7172 } 7173 7174 write_lock(&map_tree->lock); 7175 ret = add_extent_mapping(map_tree, em, 0); 7176 write_unlock(&map_tree->lock); 7177 if (ret < 0) { 7178 btrfs_err(fs_info, 7179 "failed to add chunk map, start=%llu len=%llu: %d", 7180 em->start, em->len, ret); 7181 } 7182 free_extent_map(em); 7183 7184 return ret; 7185 } 7186 7187 static void fill_device_from_item(struct extent_buffer *leaf, 7188 struct btrfs_dev_item *dev_item, 7189 struct btrfs_device *device) 7190 { 7191 unsigned long ptr; 7192 7193 device->devid = btrfs_device_id(leaf, dev_item); 7194 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 7195 device->total_bytes = device->disk_total_bytes; 7196 device->commit_total_bytes = device->disk_total_bytes; 7197 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 7198 device->commit_bytes_used = device->bytes_used; 7199 device->type = btrfs_device_type(leaf, dev_item); 7200 device->io_align = btrfs_device_io_align(leaf, dev_item); 7201 device->io_width = btrfs_device_io_width(leaf, dev_item); 7202 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 7203 WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); 7204 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 7205 7206 ptr = btrfs_device_uuid(dev_item); 7207 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 7208 } 7209 7210 static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, 7211 u8 *fsid) 7212 { 7213 struct btrfs_fs_devices *fs_devices; 7214 int ret; 7215 7216 lockdep_assert_held(&uuid_mutex); 7217 ASSERT(fsid); 7218 7219 /* This will match only for multi-device seed fs */ 7220 list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list) 7221 if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) 7222 return fs_devices; 7223 7224 7225 fs_devices = find_fsid(fsid, NULL); 7226 if (!fs_devices) { 7227 if (!btrfs_test_opt(fs_info, DEGRADED)) 7228 return ERR_PTR(-ENOENT); 7229 7230 fs_devices = alloc_fs_devices(fsid, NULL); 7231 if (IS_ERR(fs_devices)) 7232 return fs_devices; 7233 7234 fs_devices->seeding = true; 7235 fs_devices->opened = 1; 7236 return fs_devices; 7237 } 7238 7239 /* 7240 * Upon first call for a seed fs fsid, just create a private copy of the 7241 * respective fs_devices and anchor it at fs_info->fs_devices->seed_list 7242 */ 7243 fs_devices = clone_fs_devices(fs_devices); 7244 if (IS_ERR(fs_devices)) 7245 return fs_devices; 7246 7247 ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder); 7248 if (ret) { 7249 free_fs_devices(fs_devices); 7250 return ERR_PTR(ret); 7251 } 7252 7253 if (!fs_devices->seeding) { 7254 close_fs_devices(fs_devices); 7255 free_fs_devices(fs_devices); 7256 return ERR_PTR(-EINVAL); 7257 } 7258 7259 list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list); 7260 7261 return fs_devices; 7262 } 7263 7264 static int read_one_dev(struct extent_buffer *leaf, 7265 struct btrfs_dev_item *dev_item) 7266 { 7267 BTRFS_DEV_LOOKUP_ARGS(args); 7268 struct btrfs_fs_info *fs_info = leaf->fs_info; 7269 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7270 struct btrfs_device *device; 7271 u64 devid; 7272 int ret; 7273 u8 fs_uuid[BTRFS_FSID_SIZE]; 7274 u8 dev_uuid[BTRFS_UUID_SIZE]; 7275 7276 devid = args.devid = btrfs_device_id(leaf, 
dev_item); 7277 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 7278 BTRFS_UUID_SIZE); 7279 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 7280 BTRFS_FSID_SIZE); 7281 args.uuid = dev_uuid; 7282 args.fsid = fs_uuid; 7283 7284 if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) { 7285 fs_devices = open_seed_devices(fs_info, fs_uuid); 7286 if (IS_ERR(fs_devices)) 7287 return PTR_ERR(fs_devices); 7288 } 7289 7290 device = btrfs_find_device(fs_info->fs_devices, &args); 7291 if (!device) { 7292 if (!btrfs_test_opt(fs_info, DEGRADED)) { 7293 btrfs_report_missing_device(fs_info, devid, 7294 dev_uuid, true); 7295 return -ENOENT; 7296 } 7297 7298 device = add_missing_dev(fs_devices, devid, dev_uuid); 7299 if (IS_ERR(device)) { 7300 btrfs_err(fs_info, 7301 "failed to add missing dev %llu: %ld", 7302 devid, PTR_ERR(device)); 7303 return PTR_ERR(device); 7304 } 7305 btrfs_report_missing_device(fs_info, devid, dev_uuid, false); 7306 } else { 7307 if (!device->bdev) { 7308 if (!btrfs_test_opt(fs_info, DEGRADED)) { 7309 btrfs_report_missing_device(fs_info, 7310 devid, dev_uuid, true); 7311 return -ENOENT; 7312 } 7313 btrfs_report_missing_device(fs_info, devid, 7314 dev_uuid, false); 7315 } 7316 7317 if (!device->bdev && 7318 !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { 7319 /* 7320 * this happens when a device that was properly setup 7321 * in the device info lists suddenly goes bad. 7322 * device->bdev is NULL, and so we have to set 7323 * device->missing to one here 7324 */ 7325 device->fs_devices->missing_devices++; 7326 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 7327 } 7328 7329 /* Move the device to its own fs_devices */ 7330 if (device->fs_devices != fs_devices) { 7331 ASSERT(test_bit(BTRFS_DEV_STATE_MISSING, 7332 &device->dev_state)); 7333 7334 list_move(&device->dev_list, &fs_devices->devices); 7335 device->fs_devices->num_devices--; 7336 fs_devices->num_devices++; 7337 7338 device->fs_devices->missing_devices--; 7339 fs_devices->missing_devices++; 7340 7341 device->fs_devices = fs_devices; 7342 } 7343 } 7344 7345 if (device->fs_devices != fs_info->fs_devices) { 7346 BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)); 7347 if (device->generation != 7348 btrfs_device_generation(leaf, dev_item)) 7349 return -EINVAL; 7350 } 7351 7352 fill_device_from_item(leaf, dev_item, device); 7353 if (device->bdev) { 7354 u64 max_total_bytes = bdev_nr_bytes(device->bdev); 7355 7356 if (device->total_bytes > max_total_bytes) { 7357 btrfs_err(fs_info, 7358 "device total_bytes should be at most %llu but found %llu", 7359 max_total_bytes, device->total_bytes); 7360 return -EINVAL; 7361 } 7362 } 7363 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 7364 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 7365 !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 7366 device->fs_devices->total_rw_bytes += device->total_bytes; 7367 atomic64_add(device->total_bytes - device->bytes_used, 7368 &fs_info->free_chunk_space); 7369 } 7370 ret = 0; 7371 return ret; 7372 } 7373 7374 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) 7375 { 7376 struct btrfs_root *root = fs_info->tree_root; 7377 struct btrfs_super_block *super_copy = fs_info->super_copy; 7378 struct extent_buffer *sb; 7379 struct btrfs_disk_key *disk_key; 7380 struct btrfs_chunk *chunk; 7381 u8 *array_ptr; 7382 unsigned long sb_array_offset; 7383 int ret = 0; 7384 u32 num_stripes; 7385 u32 array_size; 7386 u32 len = 0; 7387 u32 cur_offset; 7388 u64 
type; 7389 struct btrfs_key key; 7390 7391 ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize); 7392 /* 7393 * This will create extent buffer of nodesize, superblock size is 7394 * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will 7395 * overallocate but we can keep it as-is, only the first page is used. 7396 */ 7397 sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET, 7398 root->root_key.objectid, 0); 7399 if (IS_ERR(sb)) 7400 return PTR_ERR(sb); 7401 set_extent_buffer_uptodate(sb); 7402 /* 7403 * The sb extent buffer is artificial and just used to read the system array. 7404 * set_extent_buffer_uptodate() call does not properly mark all it's 7405 * pages up-to-date when the page is larger: extent does not cover the 7406 * whole page and consequently check_page_uptodate does not find all 7407 * the page's extents up-to-date (the hole beyond sb), 7408 * write_extent_buffer then triggers a WARN_ON. 7409 * 7410 * Regular short extents go through mark_extent_buffer_dirty/writeback cycle, 7411 * but sb spans only this function. Add an explicit SetPageUptodate call 7412 * to silence the warning eg. on PowerPC 64. 7413 */ 7414 if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE) 7415 SetPageUptodate(sb->pages[0]); 7416 7417 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 7418 array_size = btrfs_super_sys_array_size(super_copy); 7419 7420 array_ptr = super_copy->sys_chunk_array; 7421 sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array); 7422 cur_offset = 0; 7423 7424 while (cur_offset < array_size) { 7425 disk_key = (struct btrfs_disk_key *)array_ptr; 7426 len = sizeof(*disk_key); 7427 if (cur_offset + len > array_size) 7428 goto out_short_read; 7429 7430 btrfs_disk_key_to_cpu(&key, disk_key); 7431 7432 array_ptr += len; 7433 sb_array_offset += len; 7434 cur_offset += len; 7435 7436 if (key.type != BTRFS_CHUNK_ITEM_KEY) { 7437 btrfs_err(fs_info, 7438 "unexpected item type %u in sys_array at offset %u", 7439 (u32)key.type, cur_offset); 7440 ret = -EIO; 7441 break; 7442 } 7443 7444 chunk = (struct btrfs_chunk *)sb_array_offset; 7445 /* 7446 * At least one btrfs_chunk with one stripe must be present, 7447 * exact stripe count check comes afterwards 7448 */ 7449 len = btrfs_chunk_item_size(1); 7450 if (cur_offset + len > array_size) 7451 goto out_short_read; 7452 7453 num_stripes = btrfs_chunk_num_stripes(sb, chunk); 7454 if (!num_stripes) { 7455 btrfs_err(fs_info, 7456 "invalid number of stripes %u in sys_array at offset %u", 7457 num_stripes, cur_offset); 7458 ret = -EIO; 7459 break; 7460 } 7461 7462 type = btrfs_chunk_type(sb, chunk); 7463 if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { 7464 btrfs_err(fs_info, 7465 "invalid chunk type %llu in sys_array at offset %u", 7466 type, cur_offset); 7467 ret = -EIO; 7468 break; 7469 } 7470 7471 len = btrfs_chunk_item_size(num_stripes); 7472 if (cur_offset + len > array_size) 7473 goto out_short_read; 7474 7475 ret = read_one_chunk(&key, sb, chunk); 7476 if (ret) 7477 break; 7478 7479 array_ptr += len; 7480 sb_array_offset += len; 7481 cur_offset += len; 7482 } 7483 clear_extent_buffer_uptodate(sb); 7484 free_extent_buffer_stale(sb); 7485 return ret; 7486 7487 out_short_read: 7488 btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u", 7489 len, cur_offset); 7490 clear_extent_buffer_uptodate(sb); 7491 free_extent_buffer_stale(sb); 7492 return -EIO; 7493 } 7494 7495 /* 7496 * Check if all chunks in the fs are OK for read-write degraded mount 7497 * 7498 * If the @failing_dev is specified, it's 
accounted as missing. 7499 * 7500 * Return true if all chunks meet the minimal RW mount requirements. 7501 * Return false if any chunk doesn't meet the minimal RW mount requirements. 7502 */ 7503 bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, 7504 struct btrfs_device *failing_dev) 7505 { 7506 struct extent_map_tree *map_tree = &fs_info->mapping_tree; 7507 struct extent_map *em; 7508 u64 next_start = 0; 7509 bool ret = true; 7510 7511 read_lock(&map_tree->lock); 7512 em = lookup_extent_mapping(map_tree, 0, (u64)-1); 7513 read_unlock(&map_tree->lock); 7514 /* No chunk at all? Return false anyway */ 7515 if (!em) { 7516 ret = false; 7517 goto out; 7518 } 7519 while (em) { 7520 struct map_lookup *map; 7521 int missing = 0; 7522 int max_tolerated; 7523 int i; 7524 7525 map = em->map_lookup; 7526 max_tolerated = 7527 btrfs_get_num_tolerated_disk_barrier_failures( 7528 map->type); 7529 for (i = 0; i < map->num_stripes; i++) { 7530 struct btrfs_device *dev = map->stripes[i].dev; 7531 7532 if (!dev || !dev->bdev || 7533 test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || 7534 dev->last_flush_error) 7535 missing++; 7536 else if (failing_dev && failing_dev == dev) 7537 missing++; 7538 } 7539 if (missing > max_tolerated) { 7540 if (!failing_dev) 7541 btrfs_warn(fs_info, 7542 "chunk %llu missing %d devices, max tolerance is %d for writable mount", 7543 em->start, missing, max_tolerated); 7544 free_extent_map(em); 7545 ret = false; 7546 goto out; 7547 } 7548 next_start = extent_map_end(em); 7549 free_extent_map(em); 7550 7551 read_lock(&map_tree->lock); 7552 em = lookup_extent_mapping(map_tree, next_start, 7553 (u64)(-1) - next_start); 7554 read_unlock(&map_tree->lock); 7555 } 7556 out: 7557 return ret; 7558 } 7559 7560 static void readahead_tree_node_children(struct extent_buffer *node) 7561 { 7562 int i; 7563 const int nr_items = btrfs_header_nritems(node); 7564 7565 for (i = 0; i < nr_items; i++) 7566 btrfs_readahead_node_child(node, i); 7567 } 7568 7569 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) 7570 { 7571 struct btrfs_root *root = fs_info->chunk_root; 7572 struct btrfs_path *path; 7573 struct extent_buffer *leaf; 7574 struct btrfs_key key; 7575 struct btrfs_key found_key; 7576 int ret; 7577 int slot; 7578 u64 total_dev = 0; 7579 u64 last_ra_node = 0; 7580 7581 path = btrfs_alloc_path(); 7582 if (!path) 7583 return -ENOMEM; 7584 7585 /* 7586 * uuid_mutex is needed only if we are mounting a sprout FS 7587 * otherwise we don't need it. 7588 */ 7589 mutex_lock(&uuid_mutex); 7590 7591 /* 7592 * It is possible for mount and umount to race in such a way that 7593 * we execute this code path, but open_fs_devices failed to clear 7594 * total_rw_bytes. We certainly want it cleared before reading the 7595 * device items, so clear it here. 7596 */ 7597 fs_info->fs_devices->total_rw_bytes = 0; 7598 7599 /* 7600 * Lockdep complains about possible circular locking dependency between 7601 * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores 7602 * used for freeze procection of a fs (struct super_block.s_writers), 7603 * which we take when starting a transaction, and extent buffers of the 7604 * chunk tree if we call read_one_dev() while holding a lock on an 7605 * extent buffer of the chunk tree. Since we are mounting the filesystem 7606 * and at this point there can't be any concurrent task modifying the 7607 * chunk tree, to keep it simple, just skip locking on the chunk tree. 
7608 */ 7609 ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags)); 7610 path->skip_locking = 1; 7611 7612 /* 7613 * Read all device items, and then all the chunk items. All 7614 * device items are found before any chunk item (their object id 7615 * is smaller than the lowest possible object id for a chunk 7616 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID). 7617 */ 7618 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 7619 key.offset = 0; 7620 key.type = 0; 7621 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 7622 if (ret < 0) 7623 goto error; 7624 while (1) { 7625 struct extent_buffer *node; 7626 7627 leaf = path->nodes[0]; 7628 slot = path->slots[0]; 7629 if (slot >= btrfs_header_nritems(leaf)) { 7630 ret = btrfs_next_leaf(root, path); 7631 if (ret == 0) 7632 continue; 7633 if (ret < 0) 7634 goto error; 7635 break; 7636 } 7637 node = path->nodes[1]; 7638 if (node) { 7639 if (last_ra_node != node->start) { 7640 readahead_tree_node_children(node); 7641 last_ra_node = node->start; 7642 } 7643 } 7644 btrfs_item_key_to_cpu(leaf, &found_key, slot); 7645 if (found_key.type == BTRFS_DEV_ITEM_KEY) { 7646 struct btrfs_dev_item *dev_item; 7647 dev_item = btrfs_item_ptr(leaf, slot, 7648 struct btrfs_dev_item); 7649 ret = read_one_dev(leaf, dev_item); 7650 if (ret) 7651 goto error; 7652 total_dev++; 7653 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { 7654 struct btrfs_chunk *chunk; 7655 7656 /* 7657 * We are only called at mount time, so no need to take 7658 * fs_info->chunk_mutex. Plus, to avoid lockdep warnings, 7659 * we always lock first fs_info->chunk_mutex before 7660 * acquiring any locks on the chunk tree. This is a 7661 * requirement for chunk allocation, see the comment on 7662 * top of btrfs_chunk_alloc() for details. 7663 */ 7664 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 7665 ret = read_one_chunk(&found_key, leaf, chunk); 7666 if (ret) 7667 goto error; 7668 } 7669 path->slots[0]++; 7670 } 7671 7672 /* 7673 * After loading chunk tree, we've got all device information, 7674 * do another round of validation checks. 
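 *
 * Specifically, the code below checks that the number of device items
 * found matches the device count recorded for the filesystem and that
 * total_rw_bytes does not exceed the superblock's total_bytes.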
void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
	struct btrfs_device *device;

	fs_devices->fs_info = fs_info;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list)
		device->fs_info = fs_info;

	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
		list_for_each_entry(device, &seed_devs->devices, dev_list)
			device->fs_info = fs_info;

		seed_devs->fs_info = fs_info;
	}
	mutex_unlock(&fs_devices->device_list_mutex);
}

static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
				 const struct btrfs_dev_stats_item *ptr,
				 int index)
{
	u64 val;

	read_extent_buffer(eb, &val,
			   offsetof(struct btrfs_dev_stats_item, values) +
			   ((unsigned long)ptr) + (index * sizeof(u64)),
			   sizeof(val));
	return val;
}

static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
				      struct btrfs_dev_stats_item *ptr,
				      int index, u64 val)
{
	write_extent_buffer(eb, &val,
			    offsetof(struct btrfs_dev_stats_item, values) +
			    ((unsigned long)ptr) + (index * sizeof(u64)),
			    sizeof(val));
}
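/*
 * Worked example for the accessors above (illustrative; the on-disk item is
 * assumed to be nothing but an array of __le64 counters): btrfs_item_ptr()
 * hands back an offset inside the extent buffer cast to a pointer, so
 * reading counter index 2 boils down to
 *
 *	offset = offsetof(struct btrfs_dev_stats_item, values)
 *		 + (unsigned long)ptr + 2 * sizeof(u64);
 *	read_extent_buffer(eb, &val, offset, sizeof(u64));
 *
 * i.e. ptr is used as a byte offset rather than dereferenced directly,
 * because the item lives inside extent buffer pages, not in a plain
 * kernel virtual mapping.
 */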
static int btrfs_device_init_dev_stats(struct btrfs_device *device,
				       struct btrfs_path *path)
{
	struct btrfs_dev_stats_item *ptr;
	struct extent_buffer *eb;
	struct btrfs_key key;
	int item_size;
	int i, ret, slot;

	if (!device->fs_info->dev_root)
		return 0;

	key.objectid = BTRFS_DEV_STATS_OBJECTID;
	key.type = BTRFS_PERSISTENT_ITEM_KEY;
	key.offset = device->devid;
	ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
	if (ret) {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
			btrfs_dev_stat_set(device, i, 0);
		device->dev_stats_valid = 1;
		btrfs_release_path(path);
		return ret < 0 ? ret : 0;
	}
	slot = path->slots[0];
	eb = path->nodes[0];
	item_size = btrfs_item_size(eb, slot);

	ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
		if (item_size >= (1 + i) * sizeof(__le64))
			btrfs_dev_stat_set(device, i,
					   btrfs_dev_stats_value(eb, ptr, i));
		else
			btrfs_dev_stat_set(device, i, 0);
	}

	device->dev_stats_valid = 1;
	btrfs_dev_stat_print_on_load(device);
	btrfs_release_path(path);

	return 0;
}

int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
	struct btrfs_device *device;
	struct btrfs_path *path = NULL;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		ret = btrfs_device_init_dev_stats(device, path);
		if (ret)
			goto out;
	}
	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
		list_for_each_entry(device, &seed_devs->devices, dev_list) {
			ret = btrfs_device_init_dev_stats(device, path);
			if (ret)
				goto out;
		}
	}
out:
	mutex_unlock(&fs_devices->device_list_mutex);

	btrfs_free_path(path);
	return ret;
}

static int update_dev_stat_item(struct btrfs_trans_handle *trans,
				struct btrfs_device *device)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *eb;
	struct btrfs_dev_stats_item *ptr;
	int ret;
	int i;

	key.objectid = BTRFS_DEV_STATS_OBJECTID;
	key.type = BTRFS_PERSISTENT_ITEM_KEY;
	key.offset = device->devid;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
	if (ret < 0) {
		btrfs_warn_in_rcu(fs_info,
			"error %d while searching for dev_stats item for device %s",
			ret, rcu_str_deref(device->name));
		goto out;
	}

	if (ret == 0 &&
	    btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
		/* need to delete old one and insert a new one */
		ret = btrfs_del_item(trans, dev_root, path);
		if (ret != 0) {
			btrfs_warn_in_rcu(fs_info,
				"delete too small dev_stats item for device %s failed %d",
				rcu_str_deref(device->name), ret);
			goto out;
		}
		ret = 1;
	}

	if (ret == 1) {
		/* need to insert a new item */
		btrfs_release_path(path);
		ret = btrfs_insert_empty_item(trans, dev_root, path,
					      &key, sizeof(*ptr));
		if (ret < 0) {
			btrfs_warn_in_rcu(fs_info,
				"insert dev_stats item for device %s failed %d",
				rcu_str_deref(device->name), ret);
			goto out;
		}
	}

	eb = path->nodes[0];
	ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		btrfs_set_dev_stats_value(eb, ptr, i,
					  btrfs_dev_stat_read(device, i));
	btrfs_mark_buffer_dirty(eb);

out:
	btrfs_free_path(path);
	return ret;
}
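/*
 * Overview of the cases handled above: btrfs_search_slot() with a negative
 * ins_len and cow == 1 leaves the path positioned for an in-place update, a
 * deletion, or an insertion, so the function distinguishes three outcomes:
 *
 *	ret == 0 and the item is large enough: overwrite the counters in place.
 *	ret == 0 but the item is smaller than struct btrfs_dev_stats_item
 *	    (e.g. written when fewer counters existed): delete it and retry
 *	    the insert path by forcing ret to 1.
 *	ret == 1 (no item for this devid yet): insert a new, full-size item.
 *
 * In every case the values written are whatever btrfs_dev_stat_read()
 * returns at this point of the transaction commit.
 */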
/*
 * Called from commit_transaction. Writes all changed device stats to disk.
 */
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	int stats_cnt;
	int ret = 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		stats_cnt = atomic_read(&device->dev_stats_ccnt);
		if (!device->dev_stats_valid || stats_cnt == 0)
			continue;

		/*
		 * There is a LOAD-LOAD control dependency between the value of
		 * dev_stats_ccnt and updating the on-disk values, which requires
		 * reading the in-memory counters. Such control dependencies
		 * require explicit read memory barriers.
		 *
		 * This memory barrier pairs with smp_mb__before_atomic in
		 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
		 * barrier implied by atomic_xchg in
		 * btrfs_dev_stats_read_and_reset.
		 */
		smp_rmb();

		ret = update_dev_stat_item(trans, device);
		if (!ret)
			atomic_sub(stats_cnt, &device->dev_stats_ccnt);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}

void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
	btrfs_dev_stat_inc(dev, index);
	btrfs_dev_stat_print_on_error(dev);
}

static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
{
	if (!dev->dev_stats_valid)
		return;
	btrfs_err_rl_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
			   rcu_str_deref(dev->name),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
{
	int i;

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		if (btrfs_dev_stat_read(dev, i) != 0)
			break;
	if (i == BTRFS_DEV_STAT_VALUES_MAX)
		return; /* all values == 0, suppress message */

	btrfs_info_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
			  rcu_str_deref(dev->name),
			  btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
			  btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
			  btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
			  btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
			  btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}
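/*
 * Typical producer side for these counters (illustrative; the actual call
 * sites live in the bio completion and checksum verification code, outside
 * this file): when an I/O against a device fails, the error handler bumps
 * the matching counter and lets the rate-limited printer above decide
 * whether to log it, e.g.
 *
 *	btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
 *
 * The increment only touches the in-memory counters and dev_stats_ccnt;
 * persisting them is deferred to btrfs_run_dev_stats() at transaction
 * commit, as described above.
 */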
int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
			struct btrfs_ioctl_get_dev_stats *stats)
{
	BTRFS_DEV_LOOKUP_ARGS(args);
	struct btrfs_device *dev;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	int i;

	mutex_lock(&fs_devices->device_list_mutex);
	args.devid = stats->devid;
	dev = btrfs_find_device(fs_info->fs_devices, &args);
	mutex_unlock(&fs_devices->device_list_mutex);

	if (!dev) {
		btrfs_warn(fs_info, "get dev_stats failed, device not found");
		return -ENODEV;
	} else if (!dev->dev_stats_valid) {
		btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
		return -ENODEV;
	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
			if (stats->nr_items > i)
				stats->values[i] =
					btrfs_dev_stat_read_and_reset(dev, i);
			else
				btrfs_dev_stat_set(dev, i, 0);
		}
		btrfs_info(fs_info, "device stats zeroed by %s (%d)",
			   current->comm, task_pid_nr(current));
	} else {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
			if (stats->nr_items > i)
				stats->values[i] = btrfs_dev_stat_read(dev, i);
	}
	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
		stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
	return 0;
}

/*
 * Update the size and bytes used for each device where it changed. This is
 * delayed since we would otherwise get errors while writing out the
 * superblocks.
 *
 * Must be invoked during transaction commit.
 */
void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
{
	struct btrfs_device *curr, *next;

	ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);

	if (list_empty(&trans->dev_update_list))
		return;

	/*
	 * We don't need the device_list_mutex here. This list is owned by the
	 * transaction and the transaction must complete before the device is
	 * released.
	 */
	mutex_lock(&trans->fs_info->chunk_mutex);
	list_for_each_entry_safe(curr, next, &trans->dev_update_list,
				 post_commit_list) {
		list_del_init(&curr->post_commit_list);
		curr->commit_total_bytes = curr->disk_total_bytes;
		curr->commit_bytes_used = curr->bytes_used;
	}
	mutex_unlock(&trans->fs_info->chunk_mutex);
}

/*
 * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
 */
int btrfs_bg_type_to_factor(u64 flags)
{
	const int index = btrfs_bg_flags_to_raid_index(flags);

	return btrfs_raid_array[index].ncopies;
}
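/*
 * Worked example (numbers follow directly from ncopies; logical_size is a
 * placeholder variable for illustration): for a DUP or RAID1-like block
 * group the factor is 2, so one GiB of logical chunk space consumes two
 * GiB of raw device space:
 *
 *	u64 raw = logical_size * btrfs_bg_type_to_factor(BTRFS_BLOCK_GROUP_RAID1);
 *
 * Parity profiles are not "simple" in this sense because their raw usage
 * depends on the stripe count, which is why the helper is documented as
 * covering DUP, RAID1-like and RAID10 only.
 */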
static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
				 u64 chunk_offset, u64 devid,
				 u64 physical_offset, u64 physical_len)
{
	struct btrfs_dev_lookup_args args = { .devid = devid };
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_device *dev;
	u64 stripe_len;
	bool found = false;
	int ret = 0;
	int i;

	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
	read_unlock(&em_tree->lock);

	if (!em) {
		btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
			  physical_offset, devid);
		ret = -EUCLEAN;
		goto out;
	}

	map = em->map_lookup;
	stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
	if (physical_len != stripe_len) {
		btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
			  physical_offset, devid, em->start, physical_len,
			  stripe_len);
		ret = -EUCLEAN;
		goto out;
	}

	for (i = 0; i < map->num_stripes; i++) {
		if (map->stripes[i].dev->devid == devid &&
		    map->stripes[i].physical == physical_offset) {
			found = true;
			if (map->verified_stripes >= map->num_stripes) {
				btrfs_err(fs_info,
					"too many dev extents for chunk %llu found",
					  em->start);
				ret = -EUCLEAN;
				goto out;
			}
			map->verified_stripes++;
			break;
		}
	}
	if (!found) {
		btrfs_err(fs_info,
	"dev extent physical offset %llu devid %llu has no corresponding chunk",
			  physical_offset, devid);
		ret = -EUCLEAN;
	}

	/* Make sure no dev extent is beyond device boundary */
	dev = btrfs_find_device(fs_info->fs_devices, &args);
	if (!dev) {
		btrfs_err(fs_info, "failed to find devid %llu", devid);
		ret = -EUCLEAN;
		goto out;
	}

	if (physical_offset + physical_len > dev->disk_total_bytes) {
		btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
			  devid, physical_offset, physical_len,
			  dev->disk_total_bytes);
		ret = -EUCLEAN;
		goto out;
	}

	if (dev->zone_info) {
		u64 zone_size = dev->zone_info->zone_size;

		if (!IS_ALIGNED(physical_offset, zone_size) ||
		    !IS_ALIGNED(physical_len, zone_size)) {
			btrfs_err(fs_info,
"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
				  devid, physical_offset, physical_len);
			ret = -EUCLEAN;
			goto out;
		}
	}

out:
	free_extent_map(em);
	return ret;
}

static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
{
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct rb_node *node;
	int ret = 0;

	read_lock(&em_tree->lock);
	for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
		em = rb_entry(node, struct extent_map, rb_node);
		if (em->map_lookup->num_stripes !=
		    em->map_lookup->verified_stripes) {
			btrfs_err(fs_info,
			"chunk %llu has missing dev extent, have %d expect %d",
				  em->start, em->map_lookup->verified_stripes,
				  em->map_lookup->num_stripes);
			ret = -EUCLEAN;
			goto out;
		}
	}
out:
	read_unlock(&em_tree->lock);
	return ret;
}
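/*
 * The two helpers above implement the cross-check in two passes (summary of
 * the code, for orientation): verify_one_dev_extent() is called once per
 * DEV_EXTENT item and checks that the extent points at an existing chunk,
 * has the expected stripe length, stays inside the device (and respects
 * zone alignment on zoned devices), and bumps map->verified_stripes for
 * the matching stripe. verify_chunk_dev_extent_mapping() then walks every
 * cached chunk mapping and requires verified_stripes == num_stripes, so a
 * chunk that lost one of its dev extents is reported as -EUCLEAN even
 * though each individual dev extent looked fine on its own.
 */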
/*
 * Ensure that all dev extents are mapped to the correct chunk, otherwise
 * later chunk allocation/free would cause unexpected behavior.
 *
 * NOTE: This will iterate through the whole device tree, which should be of
 * the same size level as the chunk tree. This slightly increases mount time.
 */
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
{
	struct btrfs_path *path;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	u64 prev_devid = 0;
	u64 prev_dev_ext_end = 0;
	int ret = 0;

	/*
	 * We don't have a dev_root because we mounted with ignorebadroots and
	 * failed to load the root, so we want to skip the verification in this
	 * case for sure.
	 *
	 * However, if the dev root is fine but the tree itself is corrupted,
	 * we'd still fail to mount. This verification is only to make sure
	 * writes can happen safely, so instead just bypass this check
	 * completely in the case of IGNOREBADROOTS.
	 */
	if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
		return 0;

	key.objectid = 1;
	key.type = BTRFS_DEV_EXTENT_KEY;
	key.offset = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_FORWARD;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
		ret = btrfs_next_leaf(root, path);
		if (ret < 0)
			goto out;
		/* No dev extents at all? Not good */
		if (ret > 0) {
			ret = -EUCLEAN;
			goto out;
		}
	}
	while (1) {
		struct extent_buffer *leaf = path->nodes[0];
		struct btrfs_dev_extent *dext;
		int slot = path->slots[0];
		u64 chunk_offset;
		u64 physical_offset;
		u64 physical_len;
		u64 devid;

		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (key.type != BTRFS_DEV_EXTENT_KEY)
			break;
		devid = key.objectid;
		physical_offset = key.offset;

		dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
		chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
		physical_len = btrfs_dev_extent_length(leaf, dext);

		/* Check if this dev extent overlaps with the previous one */
		if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
			btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
				  devid, physical_offset, prev_dev_ext_end);
			ret = -EUCLEAN;
			goto out;
		}

		ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
					    physical_offset, physical_len);
		if (ret < 0)
			goto out;
		prev_devid = devid;
		prev_dev_ext_end = physical_offset + physical_len;

		ret = btrfs_next_item(root, path);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			ret = 0;
			break;
		}
	}

	/* Ensure all chunks have corresponding dev extents */
	ret = verify_chunk_dev_extent_mapping(fs_info);
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Check whether the given block group or device is pinned by any inode being
 * used as a swapfile.
 */
bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
{
	struct btrfs_swapfile_pin *sp;
	struct rb_node *node;

	spin_lock(&fs_info->swapfile_pins_lock);
	node = fs_info->swapfile_pins.rb_node;
	while (node) {
		sp = rb_entry(node, struct btrfs_swapfile_pin, node);
		if (ptr < sp->ptr)
			node = node->rb_left;
		else if (ptr > sp->ptr)
			node = node->rb_right;
		else
			break;
	}
	spin_unlock(&fs_info->swapfile_pins_lock);
	return node != NULL;
}

static int relocating_repair_kthread(void *data)
{
	struct btrfs_block_group *cache = (struct btrfs_block_group *)data;
	struct btrfs_fs_info *fs_info = cache->fs_info;
	u64 target;
	int ret = 0;

	target = cache->start;
	btrfs_put_block_group(cache);

	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
		btrfs_info(fs_info,
			   "zoned: skip relocating block group %llu to repair: EBUSY",
			   target);
		return -EBUSY;
	}

	mutex_lock(&fs_info->reclaim_bgs_lock);

	/* Ensure block group still exists */
	cache = btrfs_lookup_block_group(fs_info, target);
	if (!cache)
		goto out;

	if (!cache->relocating_repair)
		goto out;

	ret = btrfs_may_alloc_data_chunk(fs_info, target);
	if (ret < 0)
		goto out;

	btrfs_info(fs_info,
		   "zoned: relocating block group %llu to repair IO failure",
		   target);
	ret = btrfs_relocate_chunk(fs_info, target);

out:
	if (cache)
		btrfs_put_block_group(cache);
	mutex_unlock(&fs_info->reclaim_bgs_lock);
	btrfs_exclop_finish(fs_info);

	return ret;
}
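/*
 * Ownership note for the kthread above (describes the handoff implemented
 * here): the caller, btrfs_repair_one_zone(), passes its block group
 * reference to the thread; the thread records cache->start, drops that
 * reference immediately, and then looks the group up again under
 * reclaim_bgs_lock, so a block group that was removed in the meantime is
 * simply skipped rather than used after free.
 */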
bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
{
	struct btrfs_block_group *cache;

	if (!btrfs_is_zoned(fs_info))
		return false;

	/* Do not attempt to repair in degraded state */
	if (btrfs_test_opt(fs_info, DEGRADED))
		return true;

	cache = btrfs_lookup_block_group(fs_info, logical);
	if (!cache)
		return true;

	spin_lock(&cache->lock);
	if (cache->relocating_repair) {
		spin_unlock(&cache->lock);
		btrfs_put_block_group(cache);
		return true;
	}
	cache->relocating_repair = 1;
	spin_unlock(&cache->lock);

	kthread_run(relocating_repair_kthread, cache,
		    "btrfs-relocating-repair");

	return true;
}
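/*
 * Usage sketch (illustrative; the real call site is in the read-repair
 * code outside this file, and failed_logical is a placeholder name): on a
 * zoned filesystem individual sectors cannot be rewritten in place, so
 * instead of rewriting a bad copy the repair path asks for the whole
 * block group to be relocated:
 *
 *	if (btrfs_repair_one_zone(fs_info, failed_logical))
 *		return;	/* zoned: repair queued, don't rewrite in place *\/
 *
 * The return value only reports whether the zoned path took over the
 * request; the actual relocation happens asynchronously in
 * relocating_repair_kthread().
 */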