1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2007 Oracle. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include <linux/sched/mm.h> 8 #include <linux/bio.h> 9 #include <linux/slab.h> 10 #include <linux/blkdev.h> 11 #include <linux/ratelimit.h> 12 #include <linux/kthread.h> 13 #include <linux/raid/pq.h> 14 #include <linux/semaphore.h> 15 #include <linux/uuid.h> 16 #include <linux/list_sort.h> 17 #include "misc.h" 18 #include "ctree.h" 19 #include "extent_map.h" 20 #include "disk-io.h" 21 #include "transaction.h" 22 #include "print-tree.h" 23 #include "volumes.h" 24 #include "raid56.h" 25 #include "async-thread.h" 26 #include "check-integrity.h" 27 #include "rcu-string.h" 28 #include "dev-replace.h" 29 #include "sysfs.h" 30 #include "tree-checker.h" 31 #include "space-info.h" 32 #include "block-group.h" 33 #include "discard.h" 34 #include "zoned.h" 35 36 const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { 37 [BTRFS_RAID_RAID10] = { 38 .sub_stripes = 2, 39 .dev_stripes = 1, 40 .devs_max = 0, /* 0 == as many as possible */ 41 .devs_min = 2, 42 .tolerated_failures = 1, 43 .devs_increment = 2, 44 .ncopies = 2, 45 .nparity = 0, 46 .raid_name = "raid10", 47 .bg_flag = BTRFS_BLOCK_GROUP_RAID10, 48 .mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, 49 }, 50 [BTRFS_RAID_RAID1] = { 51 .sub_stripes = 1, 52 .dev_stripes = 1, 53 .devs_max = 2, 54 .devs_min = 2, 55 .tolerated_failures = 1, 56 .devs_increment = 2, 57 .ncopies = 2, 58 .nparity = 0, 59 .raid_name = "raid1", 60 .bg_flag = BTRFS_BLOCK_GROUP_RAID1, 61 .mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, 62 }, 63 [BTRFS_RAID_RAID1C3] = { 64 .sub_stripes = 1, 65 .dev_stripes = 1, 66 .devs_max = 3, 67 .devs_min = 3, 68 .tolerated_failures = 2, 69 .devs_increment = 3, 70 .ncopies = 3, 71 .nparity = 0, 72 .raid_name = "raid1c3", 73 .bg_flag = BTRFS_BLOCK_GROUP_RAID1C3, 74 .mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET, 75 }, 76 [BTRFS_RAID_RAID1C4] = { 77 .sub_stripes = 1, 78 .dev_stripes = 1, 79 .devs_max = 4, 80 .devs_min = 4, 81 .tolerated_failures = 3, 82 .devs_increment = 4, 83 .ncopies = 4, 84 .nparity = 0, 85 .raid_name = "raid1c4", 86 .bg_flag = BTRFS_BLOCK_GROUP_RAID1C4, 87 .mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET, 88 }, 89 [BTRFS_RAID_DUP] = { 90 .sub_stripes = 1, 91 .dev_stripes = 2, 92 .devs_max = 1, 93 .devs_min = 1, 94 .tolerated_failures = 0, 95 .devs_increment = 1, 96 .ncopies = 2, 97 .nparity = 0, 98 .raid_name = "dup", 99 .bg_flag = BTRFS_BLOCK_GROUP_DUP, 100 .mindev_error = 0, 101 }, 102 [BTRFS_RAID_RAID0] = { 103 .sub_stripes = 1, 104 .dev_stripes = 1, 105 .devs_max = 0, 106 .devs_min = 1, 107 .tolerated_failures = 0, 108 .devs_increment = 1, 109 .ncopies = 1, 110 .nparity = 0, 111 .raid_name = "raid0", 112 .bg_flag = BTRFS_BLOCK_GROUP_RAID0, 113 .mindev_error = 0, 114 }, 115 [BTRFS_RAID_SINGLE] = { 116 .sub_stripes = 1, 117 .dev_stripes = 1, 118 .devs_max = 1, 119 .devs_min = 1, 120 .tolerated_failures = 0, 121 .devs_increment = 1, 122 .ncopies = 1, 123 .nparity = 0, 124 .raid_name = "single", 125 .bg_flag = 0, 126 .mindev_error = 0, 127 }, 128 [BTRFS_RAID_RAID5] = { 129 .sub_stripes = 1, 130 .dev_stripes = 1, 131 .devs_max = 0, 132 .devs_min = 2, 133 .tolerated_failures = 1, 134 .devs_increment = 1, 135 .ncopies = 1, 136 .nparity = 1, 137 .raid_name = "raid5", 138 .bg_flag = BTRFS_BLOCK_GROUP_RAID5, 139 .mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, 140 }, 141 [BTRFS_RAID_RAID6] = { 142 .sub_stripes = 1, 143 .dev_stripes = 1, 144 .devs_max = 0, 145 .devs_min = 3, 
146 .tolerated_failures = 2, 147 .devs_increment = 1, 148 .ncopies = 1, 149 .nparity = 2, 150 .raid_name = "raid6", 151 .bg_flag = BTRFS_BLOCK_GROUP_RAID6, 152 .mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, 153 }, 154 }; 155 156 /* 157 * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which 158 * can be used as index to access btrfs_raid_array[]. 159 */ 160 enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags) 161 { 162 if (flags & BTRFS_BLOCK_GROUP_RAID10) 163 return BTRFS_RAID_RAID10; 164 else if (flags & BTRFS_BLOCK_GROUP_RAID1) 165 return BTRFS_RAID_RAID1; 166 else if (flags & BTRFS_BLOCK_GROUP_RAID1C3) 167 return BTRFS_RAID_RAID1C3; 168 else if (flags & BTRFS_BLOCK_GROUP_RAID1C4) 169 return BTRFS_RAID_RAID1C4; 170 else if (flags & BTRFS_BLOCK_GROUP_DUP) 171 return BTRFS_RAID_DUP; 172 else if (flags & BTRFS_BLOCK_GROUP_RAID0) 173 return BTRFS_RAID_RAID0; 174 else if (flags & BTRFS_BLOCK_GROUP_RAID5) 175 return BTRFS_RAID_RAID5; 176 else if (flags & BTRFS_BLOCK_GROUP_RAID6) 177 return BTRFS_RAID_RAID6; 178 179 return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ 180 } 181 182 const char *btrfs_bg_type_to_raid_name(u64 flags) 183 { 184 const int index = btrfs_bg_flags_to_raid_index(flags); 185 186 if (index >= BTRFS_NR_RAID_TYPES) 187 return NULL; 188 189 return btrfs_raid_array[index].raid_name; 190 } 191 192 /* 193 * Fill @buf with textual description of @bg_flags, no more than @size_buf 194 * bytes including terminating null byte. 195 */ 196 void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf) 197 { 198 int i; 199 int ret; 200 char *bp = buf; 201 u64 flags = bg_flags; 202 u32 size_bp = size_buf; 203 204 if (!flags) { 205 strcpy(bp, "NONE"); 206 return; 207 } 208 209 #define DESCRIBE_FLAG(flag, desc) \ 210 do { \ 211 if (flags & (flag)) { \ 212 ret = snprintf(bp, size_bp, "%s|", (desc)); \ 213 if (ret < 0 || ret >= size_bp) \ 214 goto out_overflow; \ 215 size_bp -= ret; \ 216 bp += ret; \ 217 flags &= ~(flag); \ 218 } \ 219 } while (0) 220 221 DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data"); 222 DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system"); 223 DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata"); 224 225 DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single"); 226 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 227 DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag, 228 btrfs_raid_array[i].raid_name); 229 #undef DESCRIBE_FLAG 230 231 if (flags) { 232 ret = snprintf(bp, size_bp, "0x%llx|", flags); 233 size_bp -= ret; 234 } 235 236 if (size_bp < size_buf) 237 buf[size_buf - size_bp - 1] = '\0'; /* remove last | */ 238 239 /* 240 * The text is trimmed, it's up to the caller to provide sufficiently 241 * large buffer 242 */ 243 out_overflow:; 244 } 245 246 static int init_first_rw_device(struct btrfs_trans_handle *trans); 247 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); 248 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); 249 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); 250 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, 251 enum btrfs_map_op op, 252 u64 logical, u64 *length, 253 struct btrfs_bio **bbio_ret, 254 int mirror_num, int need_raid_map); 255 256 /* 257 * Device locking 258 * ============== 259 * 260 * There are several mutexes that protect manipulation of devices and low-level 261 * structures like chunks but not block groups, extents or files 262 * 263 * uuid_mutex (global lock) 264 * ------------------------ 265 * protects the 
fs_uuids list that tracks all per-fs fs_devices, resulting from 266 * the SCAN_DEV ioctl registration or from mount either implicitly (the first 267 * device) or requested by the device= mount option 268 * 269 * the mutex can be very coarse and can cover long-running operations 270 * 271 * protects: updates to fs_devices counters like missing devices, rw devices, 272 * seeding, structure cloning, opening/closing devices at mount/umount time 273 * 274 * global::fs_devs - add, remove, updates to the global list 275 * 276 * does not protect: manipulation of the fs_devices::devices list in general 277 * but in mount context it could be used to exclude list modifications by eg. 278 * scan ioctl 279 * 280 * btrfs_device::name - renames (write side), read is RCU 281 * 282 * fs_devices::device_list_mutex (per-fs, with RCU) 283 * ------------------------------------------------ 284 * protects updates to fs_devices::devices, ie. adding and deleting 285 * 286 * simple list traversal with read-only actions can be done with RCU protection 287 * 288 * may be used to exclude some operations from running concurrently without any 289 * modifications to the list (see write_all_supers) 290 * 291 * Is not required at mount and close times, because our device list is 292 * protected by the uuid_mutex at that point. 293 * 294 * balance_mutex 295 * ------------- 296 * protects balance structures (status, state) and context accessed from 297 * several places (internally, ioctl) 298 * 299 * chunk_mutex 300 * ----------- 301 * protects chunks, adding or removing during allocation, trim or when a new 302 * device is added/removed. Additionally it also protects post_commit_list of 303 * individual devices, since they can be added to the transaction's 304 * post_commit_list only with chunk_mutex held. 305 * 306 * cleaner_mutex 307 * ------------- 308 * a big lock that is held by the cleaner thread and prevents running subvolume 309 * cleaning together with relocation or delayed iputs 310 * 311 * 312 * Lock nesting 313 * ============ 314 * 315 * uuid_mutex 316 * device_list_mutex 317 * chunk_mutex 318 * balance_mutex 319 * 320 * 321 * Exclusive operations 322 * ==================== 323 * 324 * Maintains the exclusivity of the following operations that apply to the 325 * whole filesystem and cannot run in parallel. 326 * 327 * - Balance (*) 328 * - Device add 329 * - Device remove 330 * - Device replace (*) 331 * - Resize 332 * 333 * The device operations (as above) can be in one of the following states: 334 * 335 * - Running state 336 * - Paused state 337 * - Completed state 338 * 339 * Only device operations marked with (*) can go into the Paused state for the 340 * following reasons: 341 * 342 * - ioctl (only Balance can be Paused through ioctl) 343 * - filesystem remounted as read-only 344 * - filesystem unmounted and mounted as read-only 345 * - system power-cycle and filesystem mounted as read-only 346 * - filesystem or device errors leading to forced read-only 347 * 348 * The status of exclusive operation is set and cleared atomically. 349 * During the course of Paused state, fs_info::exclusive_operation remains set. 350 * A device operation in Paused or Running state can be canceled or resumed 351 * either by ioctl (Balance only) or when remounted as read-write. 352 * The exclusive status is cleared when the device operation is canceled or 353 * completed. 
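 *
 * Illustrative example (not taken verbatim from any single function): a path
 * that needs several of these locks follows the nesting documented above, e.g.
 *
 *	mutex_lock(&uuid_mutex);
 *	mutex_lock(&fs_devices->device_list_mutex);
 *	...
 *	mutex_unlock(&fs_devices->device_list_mutex);
 *	mutex_unlock(&uuid_mutex);
 *
 * i.e. uuid_mutex is taken before device_list_mutex and never the other way
 * around.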
354 */ 355 356 DEFINE_MUTEX(uuid_mutex); 357 static LIST_HEAD(fs_uuids); 358 struct list_head * __attribute_const__ btrfs_get_fs_uuids(void) 359 { 360 return &fs_uuids; 361 } 362 363 /* 364 * alloc_fs_devices - allocate struct btrfs_fs_devices 365 * @fsid: if not NULL, copy the UUID to fs_devices::fsid 366 * @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid 367 * 368 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR(). 369 * The returned struct is not linked onto any lists and can be destroyed with 370 * kfree() right away. 371 */ 372 static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, 373 const u8 *metadata_fsid) 374 { 375 struct btrfs_fs_devices *fs_devs; 376 377 fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL); 378 if (!fs_devs) 379 return ERR_PTR(-ENOMEM); 380 381 mutex_init(&fs_devs->device_list_mutex); 382 383 INIT_LIST_HEAD(&fs_devs->devices); 384 INIT_LIST_HEAD(&fs_devs->alloc_list); 385 INIT_LIST_HEAD(&fs_devs->fs_list); 386 INIT_LIST_HEAD(&fs_devs->seed_list); 387 if (fsid) 388 memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); 389 390 if (metadata_fsid) 391 memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE); 392 else if (fsid) 393 memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE); 394 395 return fs_devs; 396 } 397 398 void btrfs_free_device(struct btrfs_device *device) 399 { 400 WARN_ON(!list_empty(&device->post_commit_list)); 401 rcu_string_free(device->name); 402 extent_io_tree_release(&device->alloc_state); 403 bio_put(device->flush_bio); 404 btrfs_destroy_dev_zone_info(device); 405 kfree(device); 406 } 407 408 static void free_fs_devices(struct btrfs_fs_devices *fs_devices) 409 { 410 struct btrfs_device *device; 411 WARN_ON(fs_devices->opened); 412 while (!list_empty(&fs_devices->devices)) { 413 device = list_entry(fs_devices->devices.next, 414 struct btrfs_device, dev_list); 415 list_del(&device->dev_list); 416 btrfs_free_device(device); 417 } 418 kfree(fs_devices); 419 } 420 421 void __exit btrfs_cleanup_fs_uuids(void) 422 { 423 struct btrfs_fs_devices *fs_devices; 424 425 while (!list_empty(&fs_uuids)) { 426 fs_devices = list_entry(fs_uuids.next, 427 struct btrfs_fs_devices, fs_list); 428 list_del(&fs_devices->fs_list); 429 free_fs_devices(fs_devices); 430 } 431 } 432 433 static noinline struct btrfs_fs_devices *find_fsid( 434 const u8 *fsid, const u8 *metadata_fsid) 435 { 436 struct btrfs_fs_devices *fs_devices; 437 438 ASSERT(fsid); 439 440 /* Handle non-split brain cases */ 441 list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 442 if (metadata_fsid) { 443 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0 444 && memcmp(metadata_fsid, fs_devices->metadata_uuid, 445 BTRFS_FSID_SIZE) == 0) 446 return fs_devices; 447 } else { 448 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) 449 return fs_devices; 450 } 451 } 452 return NULL; 453 } 454 455 static struct btrfs_fs_devices *find_fsid_with_metadata_uuid( 456 struct btrfs_super_block *disk_super) 457 { 458 459 struct btrfs_fs_devices *fs_devices; 460 461 /* 462 * Handle scanned device having completed its fsid change but 463 * belonging to a fs_devices that was created by first scanning 464 * a device which didn't have its fsid/metadata_uuid changed 465 * at all and the CHANGING_FSID_V2 flag set. 
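 *
 * In that case the in-memory fs_devices still has fsid == metadata_uuid and
 * fsid_change set, while the scanned super's metadata_uuid already points at
 * that (old) fsid; the first loop below matches exactly this combination.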
 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (fs_devices->fsid_change &&
		    memcmp(disk_super->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0) {
			return fs_devices;
		}
	}
	/*
	 * Handle scanned device having completed its fsid change but
	 * belonging to a fs_devices that was created by a device that
	 * has an outdated pair of fsid/metadata_uuid and
	 * CHANGING_FSID_V2 flag set.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (fs_devices->fsid_change &&
		    memcmp(fs_devices->metadata_uuid,
			   fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
		    memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0) {
			return fs_devices;
		}
	}

	return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
}


static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
		      int flush, struct block_device **bdev,
		      struct btrfs_super_block **disk_super)
{
	int ret;

	*bdev = blkdev_get_by_path(device_path, flags, holder);

	if (IS_ERR(*bdev)) {
		ret = PTR_ERR(*bdev);
		goto error;
	}

	if (flush)
		filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
	if (ret) {
		blkdev_put(*bdev, flags);
		goto error;
	}
	invalidate_bdev(*bdev);
	*disk_super = btrfs_read_dev_super(*bdev);
	if (IS_ERR(*disk_super)) {
		ret = PTR_ERR(*disk_super);
		blkdev_put(*bdev, flags);
		goto error;
	}

	return 0;

error:
	*bdev = NULL;
	return ret;
}

static bool device_path_matched(const char *path, struct btrfs_device *device)
{
	int found;

	rcu_read_lock();
	found = strcmp(rcu_str_deref(device->name), path);
	rcu_read_unlock();

	return found == 0;
}

/*
 * Search and remove all stale devices (devices which are not mounted).
 * When both inputs are NULL, it will search and release all stale devices.
 *
 * @path:	Optional. When provided, it will release all unmounted devices
 *		matching this path only.
 * @skip_device: Optional. Will skip this device when searching for the stale
 *		devices.
 *
 * Return:	0 for success or if @path is NULL.
 *		-EBUSY if @path is a mounted device.
 *		-ENOENT if @path does not match any device in the list.
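 *
 * Illustrative calls (the device path is hypothetical):
 * btrfs_free_stale_devices("/dev/sdX", NULL) releases only an unmounted
 * device registered under that path, while
 * btrfs_free_stale_devices(NULL, NULL) releases every unmounted stale device.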
553 */ 554 static int btrfs_free_stale_devices(const char *path, 555 struct btrfs_device *skip_device) 556 { 557 struct btrfs_fs_devices *fs_devices, *tmp_fs_devices; 558 struct btrfs_device *device, *tmp_device; 559 int ret = 0; 560 561 lockdep_assert_held(&uuid_mutex); 562 563 if (path) 564 ret = -ENOENT; 565 566 list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) { 567 568 mutex_lock(&fs_devices->device_list_mutex); 569 list_for_each_entry_safe(device, tmp_device, 570 &fs_devices->devices, dev_list) { 571 if (skip_device && skip_device == device) 572 continue; 573 if (path && !device->name) 574 continue; 575 if (path && !device_path_matched(path, device)) 576 continue; 577 if (fs_devices->opened) { 578 /* for an already deleted device return 0 */ 579 if (path && ret != 0) 580 ret = -EBUSY; 581 break; 582 } 583 584 /* delete the stale device */ 585 fs_devices->num_devices--; 586 list_del(&device->dev_list); 587 btrfs_free_device(device); 588 589 ret = 0; 590 } 591 mutex_unlock(&fs_devices->device_list_mutex); 592 593 if (fs_devices->num_devices == 0) { 594 btrfs_sysfs_remove_fsid(fs_devices); 595 list_del(&fs_devices->fs_list); 596 free_fs_devices(fs_devices); 597 } 598 } 599 600 return ret; 601 } 602 603 /* 604 * This is only used on mount, and we are protected from competing things 605 * messing with our fs_devices by the uuid_mutex, thus we do not need the 606 * fs_devices->device_list_mutex here. 607 */ 608 static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, 609 struct btrfs_device *device, fmode_t flags, 610 void *holder) 611 { 612 struct request_queue *q; 613 struct block_device *bdev; 614 struct btrfs_super_block *disk_super; 615 u64 devid; 616 int ret; 617 618 if (device->bdev) 619 return -EINVAL; 620 if (!device->name) 621 return -EINVAL; 622 623 ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, 624 &bdev, &disk_super); 625 if (ret) 626 return ret; 627 628 devid = btrfs_stack_device_id(&disk_super->dev_item); 629 if (devid != device->devid) 630 goto error_free_page; 631 632 if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE)) 633 goto error_free_page; 634 635 device->generation = btrfs_super_generation(disk_super); 636 637 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { 638 if (btrfs_super_incompat_flags(disk_super) & 639 BTRFS_FEATURE_INCOMPAT_METADATA_UUID) { 640 pr_err( 641 "BTRFS: Invalid seeding and uuid-changed device detected\n"); 642 goto error_free_page; 643 } 644 645 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 646 fs_devices->seeding = true; 647 } else { 648 if (bdev_read_only(bdev)) 649 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 650 else 651 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 652 } 653 654 q = bdev_get_queue(bdev); 655 if (!blk_queue_nonrot(q)) 656 fs_devices->rotating = true; 657 658 device->bdev = bdev; 659 clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 660 device->mode = flags; 661 662 fs_devices->open_devices++; 663 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 664 device->devid != BTRFS_DEV_REPLACE_DEVID) { 665 fs_devices->rw_devices++; 666 list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list); 667 } 668 btrfs_release_disk_super(disk_super); 669 670 return 0; 671 672 error_free_page: 673 btrfs_release_disk_super(disk_super); 674 blkdev_put(bdev, flags); 675 676 return -EINVAL; 677 } 678 679 /* 680 * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices 681 * 
being created with a disk that has already completed its fsid change. Such
 * disk can belong to an fs which has its FSID changed or to one which doesn't.
 * Handle both cases here.
 */
static struct btrfs_fs_devices *find_fsid_inprogress(
					struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
			   BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
			return fs_devices;
		}
	}

	return find_fsid(disk_super->fsid, NULL);
}


static struct btrfs_fs_devices *find_fsid_changed(
					struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handles the case where the scanned device is part of an fs that had
	 * multiple successful changes of FSID but the currently scanned device
	 * didn't observe it. Meaning our fsid will be different from theirs.
	 * We need to handle two subcases:
	 * 1 - The fs still continues to have different METADATA/FSID uuids.
	 * 2 - The fs is switched back to its original FSID (METADATA/FSID
	 * are equal).
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		/* Changed UUIDs */
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, disk_super->fsid,
			   BTRFS_FSID_SIZE) != 0)
			return fs_devices;

		/* Unchanged UUIDs */
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, disk_super->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}

	return NULL;
}

static struct btrfs_fs_devices *find_fsid_reverted_metadata(
					struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handle the case where the scanned device is part of an fs whose last
	 * metadata UUID change reverted it to the original FSID. At the same
	 * time fs_devices was first created by another constituent device
	 * which didn't fully observe the operation. This results in a
	 * btrfs_fs_devices created with metadata/fsid different AND
	 * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
	 * fs_devices equal to the FSID of the disk.
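 *
 * Concretely, the check below matches a fs_devices whose fsid and
 * metadata_uuid differ, whose fsid_change flag is set, and whose
 * metadata_uuid equals the fsid carried by the scanned super block.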
751 */ 752 list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 753 if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid, 754 BTRFS_FSID_SIZE) != 0 && 755 memcmp(fs_devices->metadata_uuid, disk_super->fsid, 756 BTRFS_FSID_SIZE) == 0 && 757 fs_devices->fsid_change) 758 return fs_devices; 759 } 760 761 return NULL; 762 } 763 /* 764 * Add new device to list of registered devices 765 * 766 * Returns: 767 * device pointer which was just added or updated when successful 768 * error pointer when failed 769 */ 770 static noinline struct btrfs_device *device_list_add(const char *path, 771 struct btrfs_super_block *disk_super, 772 bool *new_device_added) 773 { 774 struct btrfs_device *device; 775 struct btrfs_fs_devices *fs_devices = NULL; 776 struct rcu_string *name; 777 u64 found_transid = btrfs_super_generation(disk_super); 778 u64 devid = btrfs_stack_device_id(&disk_super->dev_item); 779 bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & 780 BTRFS_FEATURE_INCOMPAT_METADATA_UUID); 781 bool fsid_change_in_progress = (btrfs_super_flags(disk_super) & 782 BTRFS_SUPER_FLAG_CHANGING_FSID_V2); 783 784 if (fsid_change_in_progress) { 785 if (!has_metadata_uuid) 786 fs_devices = find_fsid_inprogress(disk_super); 787 else 788 fs_devices = find_fsid_changed(disk_super); 789 } else if (has_metadata_uuid) { 790 fs_devices = find_fsid_with_metadata_uuid(disk_super); 791 } else { 792 fs_devices = find_fsid_reverted_metadata(disk_super); 793 if (!fs_devices) 794 fs_devices = find_fsid(disk_super->fsid, NULL); 795 } 796 797 798 if (!fs_devices) { 799 if (has_metadata_uuid) 800 fs_devices = alloc_fs_devices(disk_super->fsid, 801 disk_super->metadata_uuid); 802 else 803 fs_devices = alloc_fs_devices(disk_super->fsid, NULL); 804 805 if (IS_ERR(fs_devices)) 806 return ERR_CAST(fs_devices); 807 808 fs_devices->fsid_change = fsid_change_in_progress; 809 810 mutex_lock(&fs_devices->device_list_mutex); 811 list_add(&fs_devices->fs_list, &fs_uuids); 812 813 device = NULL; 814 } else { 815 mutex_lock(&fs_devices->device_list_mutex); 816 device = btrfs_find_device(fs_devices, devid, 817 disk_super->dev_item.uuid, NULL); 818 819 /* 820 * If this disk has been pulled into an fs devices created by 821 * a device which had the CHANGING_FSID_V2 flag then replace the 822 * metadata_uuid/fsid values of the fs_devices. 
 */
		if (fs_devices->fsid_change &&
		    found_transid > fs_devices->latest_generation) {
			memcpy(fs_devices->fsid, disk_super->fsid,
					BTRFS_FSID_SIZE);

			if (has_metadata_uuid)
				memcpy(fs_devices->metadata_uuid,
				       disk_super->metadata_uuid,
				       BTRFS_FSID_SIZE);
			else
				memcpy(fs_devices->metadata_uuid,
				       disk_super->fsid, BTRFS_FSID_SIZE);

			fs_devices->fsid_change = false;
		}
	}

	if (!device) {
		if (fs_devices->opened) {
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-EBUSY);
		}

		device = btrfs_alloc_device(NULL, &devid,
					    disk_super->dev_item.uuid);
		if (IS_ERR(device)) {
			mutex_unlock(&fs_devices->device_list_mutex);
			/* we can safely leave the fs_devices entry around */
			return device;
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			btrfs_free_device(device);
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-ENOMEM);
		}
		rcu_assign_pointer(device->name, name);

		list_add_rcu(&device->dev_list, &fs_devices->devices);
		fs_devices->num_devices++;

		device->fs_devices = fs_devices;
		*new_device_added = true;

		if (disk_super->label[0])
			pr_info(
	"BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->label, devid, found_transid, path,
				current->comm, task_pid_nr(current));
		else
			pr_info(
	"BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->fsid, devid, found_transid, path,
				current->comm, task_pid_nr(current));

	} else if (!device->name || strcmp(device->name->str, path)) {
		/*
		 * When FS is already mounted.
		 * 1. If you are here and if the device->name is NULL that
		 *    means this device was missing at the time of FS mount.
		 * 2. If you are here and if the device->name is different
		 *    from 'path' that means either
		 *	a. The same device disappeared and reappeared with a
		 *	   different name, or
		 *	b. The missing-disk-which-was-replaced has
		 *	   reappeared now.
		 *
		 * We must allow 1 and 2a above. But 2b would be spurious
		 * and unintentional.
		 *
		 * Further, in case of 1 and 2a above, the disk at 'path'
		 * would have missed some transactions when it was away, and
		 * in case of 2a the stale bdev has to be updated as well.
		 * 2b must not be allowed at any time.
		 */

		/*
		 * For now, we do allow updating the btrfs_fs_device through
		 * the btrfs dev scan cli after the FS has been mounted. We're
		 * still tracking a problem where systems fail to mount by
		 * subvolume id when we reject replacement on a mounted FS.
		 */
		if (!fs_devices->opened && found_transid < device->generation) {
			/*
			 * That is, if the FS is _not_ mounted and if you
			 * are here, that means there is more than one
			 * disk with the same uuid and devid. We keep the one
			 * with the larger generation number or the last-in if
			 * generations are equal.
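			 *
			 * (A cloned disk, e.g. a dd copy of a member device
			 * that gets scanned alongside the original, is one
			 * way to end up in this situation.)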
914 */ 915 mutex_unlock(&fs_devices->device_list_mutex); 916 return ERR_PTR(-EEXIST); 917 } 918 919 /* 920 * We are going to replace the device path for a given devid, 921 * make sure it's the same device if the device is mounted 922 */ 923 if (device->bdev) { 924 int error; 925 dev_t path_dev; 926 927 error = lookup_bdev(path, &path_dev); 928 if (error) { 929 mutex_unlock(&fs_devices->device_list_mutex); 930 return ERR_PTR(error); 931 } 932 933 if (device->bdev->bd_dev != path_dev) { 934 mutex_unlock(&fs_devices->device_list_mutex); 935 /* 936 * device->fs_info may not be reliable here, so 937 * pass in a NULL instead. This avoids a 938 * possible use-after-free when the fs_info and 939 * fs_info->sb are already torn down. 940 */ 941 btrfs_warn_in_rcu(NULL, 942 "duplicate device %s devid %llu generation %llu scanned by %s (%d)", 943 path, devid, found_transid, 944 current->comm, 945 task_pid_nr(current)); 946 return ERR_PTR(-EEXIST); 947 } 948 btrfs_info_in_rcu(device->fs_info, 949 "devid %llu device path %s changed to %s scanned by %s (%d)", 950 devid, rcu_str_deref(device->name), 951 path, current->comm, 952 task_pid_nr(current)); 953 } 954 955 name = rcu_string_strdup(path, GFP_NOFS); 956 if (!name) { 957 mutex_unlock(&fs_devices->device_list_mutex); 958 return ERR_PTR(-ENOMEM); 959 } 960 rcu_string_free(device->name); 961 rcu_assign_pointer(device->name, name); 962 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { 963 fs_devices->missing_devices--; 964 clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 965 } 966 } 967 968 /* 969 * Unmount does not free the btrfs_device struct but would zero 970 * generation along with most of the other members. So just update 971 * it back. We need it to pick the disk with largest generation 972 * (as above). 973 */ 974 if (!fs_devices->opened) { 975 device->generation = found_transid; 976 fs_devices->latest_generation = max_t(u64, found_transid, 977 fs_devices->latest_generation); 978 } 979 980 fs_devices->total_devices = btrfs_super_num_devices(disk_super); 981 982 mutex_unlock(&fs_devices->device_list_mutex); 983 return device; 984 } 985 986 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) 987 { 988 struct btrfs_fs_devices *fs_devices; 989 struct btrfs_device *device; 990 struct btrfs_device *orig_dev; 991 int ret = 0; 992 993 lockdep_assert_held(&uuid_mutex); 994 995 fs_devices = alloc_fs_devices(orig->fsid, NULL); 996 if (IS_ERR(fs_devices)) 997 return fs_devices; 998 999 fs_devices->total_devices = orig->total_devices; 1000 1001 list_for_each_entry(orig_dev, &orig->devices, dev_list) { 1002 struct rcu_string *name; 1003 1004 device = btrfs_alloc_device(NULL, &orig_dev->devid, 1005 orig_dev->uuid); 1006 if (IS_ERR(device)) { 1007 ret = PTR_ERR(device); 1008 goto error; 1009 } 1010 1011 /* 1012 * This is ok to do without rcu read locked because we hold the 1013 * uuid mutex so nothing we touch in here is going to disappear. 
1014 */ 1015 if (orig_dev->name) { 1016 name = rcu_string_strdup(orig_dev->name->str, 1017 GFP_KERNEL); 1018 if (!name) { 1019 btrfs_free_device(device); 1020 ret = -ENOMEM; 1021 goto error; 1022 } 1023 rcu_assign_pointer(device->name, name); 1024 } 1025 1026 list_add(&device->dev_list, &fs_devices->devices); 1027 device->fs_devices = fs_devices; 1028 fs_devices->num_devices++; 1029 } 1030 return fs_devices; 1031 error: 1032 free_fs_devices(fs_devices); 1033 return ERR_PTR(ret); 1034 } 1035 1036 static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, 1037 struct btrfs_device **latest_dev) 1038 { 1039 struct btrfs_device *device, *next; 1040 1041 /* This is the initialized path, it is safe to release the devices. */ 1042 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 1043 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) { 1044 if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, 1045 &device->dev_state) && 1046 !test_bit(BTRFS_DEV_STATE_MISSING, 1047 &device->dev_state) && 1048 (!*latest_dev || 1049 device->generation > (*latest_dev)->generation)) { 1050 *latest_dev = device; 1051 } 1052 continue; 1053 } 1054 1055 /* 1056 * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID, 1057 * in btrfs_init_dev_replace() so just continue. 1058 */ 1059 if (device->devid == BTRFS_DEV_REPLACE_DEVID) 1060 continue; 1061 1062 if (device->bdev) { 1063 blkdev_put(device->bdev, device->mode); 1064 device->bdev = NULL; 1065 fs_devices->open_devices--; 1066 } 1067 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 1068 list_del_init(&device->dev_alloc_list); 1069 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 1070 fs_devices->rw_devices--; 1071 } 1072 list_del_init(&device->dev_list); 1073 fs_devices->num_devices--; 1074 btrfs_free_device(device); 1075 } 1076 1077 } 1078 1079 /* 1080 * After we have read the system tree and know devids belonging to this 1081 * filesystem, remove the device which does not belong there. 
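 *
 * Takes uuid_mutex internally and also walks the seed device lists hanging
 * off fs_devices->seed_list (see the loop in the function body).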
1082 */ 1083 void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices) 1084 { 1085 struct btrfs_device *latest_dev = NULL; 1086 struct btrfs_fs_devices *seed_dev; 1087 1088 mutex_lock(&uuid_mutex); 1089 __btrfs_free_extra_devids(fs_devices, &latest_dev); 1090 1091 list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list) 1092 __btrfs_free_extra_devids(seed_dev, &latest_dev); 1093 1094 fs_devices->latest_bdev = latest_dev->bdev; 1095 1096 mutex_unlock(&uuid_mutex); 1097 } 1098 1099 static void btrfs_close_bdev(struct btrfs_device *device) 1100 { 1101 if (!device->bdev) 1102 return; 1103 1104 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 1105 sync_blockdev(device->bdev); 1106 invalidate_bdev(device->bdev); 1107 } 1108 1109 blkdev_put(device->bdev, device->mode); 1110 } 1111 1112 static void btrfs_close_one_device(struct btrfs_device *device) 1113 { 1114 struct btrfs_fs_devices *fs_devices = device->fs_devices; 1115 1116 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 1117 device->devid != BTRFS_DEV_REPLACE_DEVID) { 1118 list_del_init(&device->dev_alloc_list); 1119 fs_devices->rw_devices--; 1120 } 1121 1122 if (device->devid == BTRFS_DEV_REPLACE_DEVID) 1123 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 1124 1125 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) 1126 fs_devices->missing_devices--; 1127 1128 btrfs_close_bdev(device); 1129 if (device->bdev) { 1130 fs_devices->open_devices--; 1131 device->bdev = NULL; 1132 } 1133 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 1134 btrfs_destroy_dev_zone_info(device); 1135 1136 device->fs_info = NULL; 1137 atomic_set(&device->dev_stats_ccnt, 0); 1138 extent_io_tree_release(&device->alloc_state); 1139 1140 /* 1141 * Reset the flush error record. We might have a transient flush error 1142 * in this mount, and if so we aborted the current transaction and set 1143 * the fs to an error state, guaranteeing no super blocks can be further 1144 * committed. However that error might be transient and if we unmount the 1145 * filesystem and mount it again, we should allow the mount to succeed 1146 * (btrfs_check_rw_degradable() should not fail) - if after mounting the 1147 * filesystem again we still get flush errors, then we will again abort 1148 * any transaction and set the error state, guaranteeing no commits of 1149 * unsafe super blocks. 
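	 *
	 * Clearing last_flush_error below is what lets such a remount start
	 * from a clean slate.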
1150 */ 1151 device->last_flush_error = 0; 1152 1153 /* Verify the device is back in a pristine state */ 1154 ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)); 1155 ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); 1156 ASSERT(list_empty(&device->dev_alloc_list)); 1157 ASSERT(list_empty(&device->post_commit_list)); 1158 ASSERT(atomic_read(&device->reada_in_flight) == 0); 1159 } 1160 1161 static void close_fs_devices(struct btrfs_fs_devices *fs_devices) 1162 { 1163 struct btrfs_device *device, *tmp; 1164 1165 lockdep_assert_held(&uuid_mutex); 1166 1167 if (--fs_devices->opened > 0) 1168 return; 1169 1170 list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) 1171 btrfs_close_one_device(device); 1172 1173 WARN_ON(fs_devices->open_devices); 1174 WARN_ON(fs_devices->rw_devices); 1175 fs_devices->opened = 0; 1176 fs_devices->seeding = false; 1177 fs_devices->fs_info = NULL; 1178 } 1179 1180 void btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 1181 { 1182 LIST_HEAD(list); 1183 struct btrfs_fs_devices *tmp; 1184 1185 mutex_lock(&uuid_mutex); 1186 close_fs_devices(fs_devices); 1187 if (!fs_devices->opened) 1188 list_splice_init(&fs_devices->seed_list, &list); 1189 1190 list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) { 1191 close_fs_devices(fs_devices); 1192 list_del(&fs_devices->seed_list); 1193 free_fs_devices(fs_devices); 1194 } 1195 mutex_unlock(&uuid_mutex); 1196 } 1197 1198 static int open_fs_devices(struct btrfs_fs_devices *fs_devices, 1199 fmode_t flags, void *holder) 1200 { 1201 struct btrfs_device *device; 1202 struct btrfs_device *latest_dev = NULL; 1203 struct btrfs_device *tmp_device; 1204 1205 flags |= FMODE_EXCL; 1206 1207 list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, 1208 dev_list) { 1209 int ret; 1210 1211 ret = btrfs_open_one_device(fs_devices, device, flags, holder); 1212 if (ret == 0 && 1213 (!latest_dev || device->generation > latest_dev->generation)) { 1214 latest_dev = device; 1215 } else if (ret == -ENODATA) { 1216 fs_devices->num_devices--; 1217 list_del(&device->dev_list); 1218 btrfs_free_device(device); 1219 } 1220 } 1221 if (fs_devices->open_devices == 0) 1222 return -EINVAL; 1223 1224 fs_devices->opened = 1; 1225 fs_devices->latest_bdev = latest_dev->bdev; 1226 fs_devices->total_rw_bytes = 0; 1227 fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; 1228 fs_devices->read_policy = BTRFS_READ_POLICY_PID; 1229 1230 return 0; 1231 } 1232 1233 static int devid_cmp(void *priv, const struct list_head *a, 1234 const struct list_head *b) 1235 { 1236 const struct btrfs_device *dev1, *dev2; 1237 1238 dev1 = list_entry(a, struct btrfs_device, dev_list); 1239 dev2 = list_entry(b, struct btrfs_device, dev_list); 1240 1241 if (dev1->devid < dev2->devid) 1242 return -1; 1243 else if (dev1->devid > dev2->devid) 1244 return 1; 1245 return 0; 1246 } 1247 1248 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 1249 fmode_t flags, void *holder) 1250 { 1251 int ret; 1252 1253 lockdep_assert_held(&uuid_mutex); 1254 /* 1255 * The device_list_mutex cannot be taken here in case opening the 1256 * underlying device takes further locks like open_mutex. 
1257 * 1258 * We also don't need the lock here as this is called during mount and 1259 * exclusion is provided by uuid_mutex 1260 */ 1261 1262 if (fs_devices->opened) { 1263 fs_devices->opened++; 1264 ret = 0; 1265 } else { 1266 list_sort(NULL, &fs_devices->devices, devid_cmp); 1267 ret = open_fs_devices(fs_devices, flags, holder); 1268 } 1269 1270 return ret; 1271 } 1272 1273 void btrfs_release_disk_super(struct btrfs_super_block *super) 1274 { 1275 struct page *page = virt_to_page(super); 1276 1277 put_page(page); 1278 } 1279 1280 static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev, 1281 u64 bytenr, u64 bytenr_orig) 1282 { 1283 struct btrfs_super_block *disk_super; 1284 struct page *page; 1285 void *p; 1286 pgoff_t index; 1287 1288 /* make sure our super fits in the device */ 1289 if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode)) 1290 return ERR_PTR(-EINVAL); 1291 1292 /* make sure our super fits in the page */ 1293 if (sizeof(*disk_super) > PAGE_SIZE) 1294 return ERR_PTR(-EINVAL); 1295 1296 /* make sure our super doesn't straddle pages on disk */ 1297 index = bytenr >> PAGE_SHIFT; 1298 if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index) 1299 return ERR_PTR(-EINVAL); 1300 1301 /* pull in the page with our super */ 1302 page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL); 1303 1304 if (IS_ERR(page)) 1305 return ERR_CAST(page); 1306 1307 p = page_address(page); 1308 1309 /* align our pointer to the offset of the super block */ 1310 disk_super = p + offset_in_page(bytenr); 1311 1312 if (btrfs_super_bytenr(disk_super) != bytenr_orig || 1313 btrfs_super_magic(disk_super) != BTRFS_MAGIC) { 1314 btrfs_release_disk_super(p); 1315 return ERR_PTR(-EINVAL); 1316 } 1317 1318 if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1]) 1319 disk_super->label[BTRFS_LABEL_SIZE - 1] = 0; 1320 1321 return disk_super; 1322 } 1323 1324 int btrfs_forget_devices(const char *path) 1325 { 1326 int ret; 1327 1328 mutex_lock(&uuid_mutex); 1329 ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL); 1330 mutex_unlock(&uuid_mutex); 1331 1332 return ret; 1333 } 1334 1335 /* 1336 * Look for a btrfs signature on a device. This may be called out of the mount path 1337 * and we are not allowed to call set_blocksize during the scan. The superblock 1338 * is read via pagecache 1339 */ 1340 struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, 1341 void *holder) 1342 { 1343 struct btrfs_super_block *disk_super; 1344 bool new_device_added = false; 1345 struct btrfs_device *device = NULL; 1346 struct block_device *bdev; 1347 u64 bytenr, bytenr_orig; 1348 int ret; 1349 1350 lockdep_assert_held(&uuid_mutex); 1351 1352 /* 1353 * we would like to check all the supers, but that would make 1354 * a btrfs mount succeed after a mkfs from a different FS. 
1355 * So, we need to add a special mount option to scan for 1356 * later supers, using BTRFS_SUPER_MIRROR_MAX instead 1357 */ 1358 flags |= FMODE_EXCL; 1359 1360 bdev = blkdev_get_by_path(path, flags, holder); 1361 if (IS_ERR(bdev)) 1362 return ERR_CAST(bdev); 1363 1364 bytenr_orig = btrfs_sb_offset(0); 1365 ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr); 1366 if (ret) 1367 return ERR_PTR(ret); 1368 1369 disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig); 1370 if (IS_ERR(disk_super)) { 1371 device = ERR_CAST(disk_super); 1372 goto error_bdev_put; 1373 } 1374 1375 device = device_list_add(path, disk_super, &new_device_added); 1376 if (!IS_ERR(device)) { 1377 if (new_device_added) 1378 btrfs_free_stale_devices(path, device); 1379 } 1380 1381 btrfs_release_disk_super(disk_super); 1382 1383 error_bdev_put: 1384 blkdev_put(bdev, flags); 1385 1386 return device; 1387 } 1388 1389 /* 1390 * Try to find a chunk that intersects [start, start + len] range and when one 1391 * such is found, record the end of it in *start 1392 */ 1393 static bool contains_pending_extent(struct btrfs_device *device, u64 *start, 1394 u64 len) 1395 { 1396 u64 physical_start, physical_end; 1397 1398 lockdep_assert_held(&device->fs_info->chunk_mutex); 1399 1400 if (!find_first_extent_bit(&device->alloc_state, *start, 1401 &physical_start, &physical_end, 1402 CHUNK_ALLOCATED, NULL)) { 1403 1404 if (in_range(physical_start, *start, len) || 1405 in_range(*start, physical_start, 1406 physical_end - physical_start)) { 1407 *start = physical_end + 1; 1408 return true; 1409 } 1410 } 1411 return false; 1412 } 1413 1414 static u64 dev_extent_search_start(struct btrfs_device *device, u64 start) 1415 { 1416 switch (device->fs_devices->chunk_alloc_policy) { 1417 case BTRFS_CHUNK_ALLOC_REGULAR: 1418 /* 1419 * We don't want to overwrite the superblock on the drive nor 1420 * any area used by the boot loader (grub for example), so we 1421 * make sure to start at an offset of at least 1MB. 1422 */ 1423 return max_t(u64, start, SZ_1M); 1424 case BTRFS_CHUNK_ALLOC_ZONED: 1425 /* 1426 * We don't care about the starting region like regular 1427 * allocator, because we anyway use/reserve the first two zones 1428 * for superblock logging. 
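		 *
		 * (The returned offset below is simply @start rounded up to
		 * the device's zone size.)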
		 */
		return ALIGN(start, device->zone_info->zone_size);
	default:
		BUG();
	}
}

static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
					u64 *hole_start, u64 *hole_size,
					u64 num_bytes)
{
	u64 zone_size = device->zone_info->zone_size;
	u64 pos;
	int ret;
	bool changed = false;

	ASSERT(IS_ALIGNED(*hole_start, zone_size));

	while (*hole_size > 0) {
		pos = btrfs_find_allocatable_zones(device, *hole_start,
						   *hole_start + *hole_size,
						   num_bytes);
		if (pos != *hole_start) {
			*hole_size = *hole_start + *hole_size - pos;
			*hole_start = pos;
			changed = true;
			if (*hole_size < num_bytes)
				break;
		}

		ret = btrfs_ensure_empty_zones(device, pos, num_bytes);

		/* Range is ensured to be empty */
		if (!ret)
			return changed;

		/* Given hole range was invalid (outside of device) */
		if (ret == -ERANGE) {
			*hole_start += *hole_size;
			*hole_size = 0;
			return true;
		}

		*hole_start += zone_size;
		*hole_size -= zone_size;
		changed = true;
	}

	return changed;
}

/**
 * dev_extent_hole_check - check if specified hole is suitable for allocation
 * @device:	the device which we have the hole
 * @hole_start: starting position of the hole
 * @hole_size:	the size of the hole
 * @num_bytes:	the size of the free space that we need
 *
 * This function may modify @hole_start and @hole_size to reflect the suitable
 * position for allocation. Returns true if the hole position is updated,
 * false otherwise.
 */
static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
				  u64 *hole_size, u64 num_bytes)
{
	bool changed = false;
	u64 hole_end = *hole_start + *hole_size;

	for (;;) {
		/*
		 * Check before we set max_hole_start, otherwise we could end up
		 * sending back this offset anyway.
		 */
		if (contains_pending_extent(device, hole_start, *hole_size)) {
			if (hole_end >= *hole_start)
				*hole_size = hole_end - *hole_start;
			else
				*hole_size = 0;
			changed = true;
		}

		switch (device->fs_devices->chunk_alloc_policy) {
		case BTRFS_CHUNK_ALLOC_REGULAR:
			/* No extra check */
			break;
		case BTRFS_CHUNK_ALLOC_ZONED:
			if (dev_extent_hole_check_zoned(device, hole_start,
							hole_size, num_bytes)) {
				changed = true;
				/*
				 * The changed hole can contain a pending
				 * extent. Loop again to check that.
				 */
				continue;
			}
			break;
		default:
			BUG();
		}

		break;
	}

	return changed;
}

/*
 * find_free_dev_extent_start - find free space in the specified device
 * @device:	  the device which we search the free space in
 * @num_bytes:	  the size of the free space that we need
 * @search_start: the position from which to begin the search
 * @start:	  store the start of the free space
 * @len:	  the size of the free space that we find, or the size
 *		  of the max free space if we don't find suitable free space
 *
 * This uses a pretty simple search; the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents.
 *
 * @start is used to store the start of the free space if we find it. But if we
 * don't find suitable free space, it will be used to store the start position
 * of the max free space.
1550 * 1551 * @len is used to store the size of the free space that we find. 1552 * But if we don't find suitable free space, it is used to store the size of 1553 * the max free space. 1554 * 1555 * NOTE: This function will search *commit* root of device tree, and does extra 1556 * check to ensure dev extents are not double allocated. 1557 * This makes the function safe to allocate dev extents but may not report 1558 * correct usable device space, as device extent freed in current transaction 1559 * is not reported as available. 1560 */ 1561 static int find_free_dev_extent_start(struct btrfs_device *device, 1562 u64 num_bytes, u64 search_start, u64 *start, 1563 u64 *len) 1564 { 1565 struct btrfs_fs_info *fs_info = device->fs_info; 1566 struct btrfs_root *root = fs_info->dev_root; 1567 struct btrfs_key key; 1568 struct btrfs_dev_extent *dev_extent; 1569 struct btrfs_path *path; 1570 u64 hole_size; 1571 u64 max_hole_start; 1572 u64 max_hole_size; 1573 u64 extent_end; 1574 u64 search_end = device->total_bytes; 1575 int ret; 1576 int slot; 1577 struct extent_buffer *l; 1578 1579 search_start = dev_extent_search_start(device, search_start); 1580 1581 WARN_ON(device->zone_info && 1582 !IS_ALIGNED(num_bytes, device->zone_info->zone_size)); 1583 1584 path = btrfs_alloc_path(); 1585 if (!path) 1586 return -ENOMEM; 1587 1588 max_hole_start = search_start; 1589 max_hole_size = 0; 1590 1591 again: 1592 if (search_start >= search_end || 1593 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 1594 ret = -ENOSPC; 1595 goto out; 1596 } 1597 1598 path->reada = READA_FORWARD; 1599 path->search_commit_root = 1; 1600 path->skip_locking = 1; 1601 1602 key.objectid = device->devid; 1603 key.offset = search_start; 1604 key.type = BTRFS_DEV_EXTENT_KEY; 1605 1606 ret = btrfs_search_backwards(root, &key, path); 1607 if (ret < 0) 1608 goto out; 1609 1610 while (1) { 1611 l = path->nodes[0]; 1612 slot = path->slots[0]; 1613 if (slot >= btrfs_header_nritems(l)) { 1614 ret = btrfs_next_leaf(root, path); 1615 if (ret == 0) 1616 continue; 1617 if (ret < 0) 1618 goto out; 1619 1620 break; 1621 } 1622 btrfs_item_key_to_cpu(l, &key, slot); 1623 1624 if (key.objectid < device->devid) 1625 goto next; 1626 1627 if (key.objectid > device->devid) 1628 break; 1629 1630 if (key.type != BTRFS_DEV_EXTENT_KEY) 1631 goto next; 1632 1633 if (key.offset > search_start) { 1634 hole_size = key.offset - search_start; 1635 dev_extent_hole_check(device, &search_start, &hole_size, 1636 num_bytes); 1637 1638 if (hole_size > max_hole_size) { 1639 max_hole_start = search_start; 1640 max_hole_size = hole_size; 1641 } 1642 1643 /* 1644 * If this free space is greater than which we need, 1645 * it must be the max free space that we have found 1646 * until now, so max_hole_start must point to the start 1647 * of this free space and the length of this free space 1648 * is stored in max_hole_size. Thus, we return 1649 * max_hole_start and max_hole_size and go back to the 1650 * caller. 1651 */ 1652 if (hole_size >= num_bytes) { 1653 ret = 0; 1654 goto out; 1655 } 1656 } 1657 1658 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 1659 extent_end = key.offset + btrfs_dev_extent_length(l, 1660 dev_extent); 1661 if (extent_end > search_start) 1662 search_start = extent_end; 1663 next: 1664 path->slots[0]++; 1665 cond_resched(); 1666 } 1667 1668 /* 1669 * At this point, search_start should be the end of 1670 * allocated dev extents, and when shrinking the device, 1671 * search_end may be smaller than search_start. 
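	 *
	 * If that happens the trailing-hole check below is simply skipped and
	 * the best hole found so far (if any) is returned.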
1672 */ 1673 if (search_end > search_start) { 1674 hole_size = search_end - search_start; 1675 if (dev_extent_hole_check(device, &search_start, &hole_size, 1676 num_bytes)) { 1677 btrfs_release_path(path); 1678 goto again; 1679 } 1680 1681 if (hole_size > max_hole_size) { 1682 max_hole_start = search_start; 1683 max_hole_size = hole_size; 1684 } 1685 } 1686 1687 /* See above. */ 1688 if (max_hole_size < num_bytes) 1689 ret = -ENOSPC; 1690 else 1691 ret = 0; 1692 1693 out: 1694 btrfs_free_path(path); 1695 *start = max_hole_start; 1696 if (len) 1697 *len = max_hole_size; 1698 return ret; 1699 } 1700 1701 int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, 1702 u64 *start, u64 *len) 1703 { 1704 /* FIXME use last free of some kind */ 1705 return find_free_dev_extent_start(device, num_bytes, 0, start, len); 1706 } 1707 1708 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, 1709 struct btrfs_device *device, 1710 u64 start, u64 *dev_extent_len) 1711 { 1712 struct btrfs_fs_info *fs_info = device->fs_info; 1713 struct btrfs_root *root = fs_info->dev_root; 1714 int ret; 1715 struct btrfs_path *path; 1716 struct btrfs_key key; 1717 struct btrfs_key found_key; 1718 struct extent_buffer *leaf = NULL; 1719 struct btrfs_dev_extent *extent = NULL; 1720 1721 path = btrfs_alloc_path(); 1722 if (!path) 1723 return -ENOMEM; 1724 1725 key.objectid = device->devid; 1726 key.offset = start; 1727 key.type = BTRFS_DEV_EXTENT_KEY; 1728 again: 1729 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1730 if (ret > 0) { 1731 ret = btrfs_previous_item(root, path, key.objectid, 1732 BTRFS_DEV_EXTENT_KEY); 1733 if (ret) 1734 goto out; 1735 leaf = path->nodes[0]; 1736 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1737 extent = btrfs_item_ptr(leaf, path->slots[0], 1738 struct btrfs_dev_extent); 1739 BUG_ON(found_key.offset > start || found_key.offset + 1740 btrfs_dev_extent_length(leaf, extent) < start); 1741 key = found_key; 1742 btrfs_release_path(path); 1743 goto again; 1744 } else if (ret == 0) { 1745 leaf = path->nodes[0]; 1746 extent = btrfs_item_ptr(leaf, path->slots[0], 1747 struct btrfs_dev_extent); 1748 } else { 1749 goto out; 1750 } 1751 1752 *dev_extent_len = btrfs_dev_extent_length(leaf, extent); 1753 1754 ret = btrfs_del_item(trans, root, path); 1755 if (ret == 0) 1756 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags); 1757 out: 1758 btrfs_free_path(path); 1759 return ret; 1760 } 1761 1762 static u64 find_next_chunk(struct btrfs_fs_info *fs_info) 1763 { 1764 struct extent_map_tree *em_tree; 1765 struct extent_map *em; 1766 struct rb_node *n; 1767 u64 ret = 0; 1768 1769 em_tree = &fs_info->mapping_tree; 1770 read_lock(&em_tree->lock); 1771 n = rb_last(&em_tree->map.rb_root); 1772 if (n) { 1773 em = rb_entry(n, struct extent_map, rb_node); 1774 ret = em->start + em->len; 1775 } 1776 read_unlock(&em_tree->lock); 1777 1778 return ret; 1779 } 1780 1781 static noinline int find_next_devid(struct btrfs_fs_info *fs_info, 1782 u64 *devid_ret) 1783 { 1784 int ret; 1785 struct btrfs_key key; 1786 struct btrfs_key found_key; 1787 struct btrfs_path *path; 1788 1789 path = btrfs_alloc_path(); 1790 if (!path) 1791 return -ENOMEM; 1792 1793 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1794 key.type = BTRFS_DEV_ITEM_KEY; 1795 key.offset = (u64)-1; 1796 1797 ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0); 1798 if (ret < 0) 1799 goto error; 1800 1801 if (ret == 0) { 1802 /* Corruption */ 1803 btrfs_err(fs_info, "corrupted chunk tree devid -1 
matched"); 1804 ret = -EUCLEAN; 1805 goto error; 1806 } 1807 1808 ret = btrfs_previous_item(fs_info->chunk_root, path, 1809 BTRFS_DEV_ITEMS_OBJECTID, 1810 BTRFS_DEV_ITEM_KEY); 1811 if (ret) { 1812 *devid_ret = 1; 1813 } else { 1814 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1815 path->slots[0]); 1816 *devid_ret = found_key.offset + 1; 1817 } 1818 ret = 0; 1819 error: 1820 btrfs_free_path(path); 1821 return ret; 1822 } 1823 1824 /* 1825 * the device information is stored in the chunk root 1826 * the btrfs_device struct should be fully filled in 1827 */ 1828 static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, 1829 struct btrfs_device *device) 1830 { 1831 int ret; 1832 struct btrfs_path *path; 1833 struct btrfs_dev_item *dev_item; 1834 struct extent_buffer *leaf; 1835 struct btrfs_key key; 1836 unsigned long ptr; 1837 1838 path = btrfs_alloc_path(); 1839 if (!path) 1840 return -ENOMEM; 1841 1842 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1843 key.type = BTRFS_DEV_ITEM_KEY; 1844 key.offset = device->devid; 1845 1846 ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path, 1847 &key, sizeof(*dev_item)); 1848 if (ret) 1849 goto out; 1850 1851 leaf = path->nodes[0]; 1852 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 1853 1854 btrfs_set_device_id(leaf, dev_item, device->devid); 1855 btrfs_set_device_generation(leaf, dev_item, 0); 1856 btrfs_set_device_type(leaf, dev_item, device->type); 1857 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1858 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1859 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1860 btrfs_set_device_total_bytes(leaf, dev_item, 1861 btrfs_device_get_disk_total_bytes(device)); 1862 btrfs_set_device_bytes_used(leaf, dev_item, 1863 btrfs_device_get_bytes_used(device)); 1864 btrfs_set_device_group(leaf, dev_item, 0); 1865 btrfs_set_device_seek_speed(leaf, dev_item, 0); 1866 btrfs_set_device_bandwidth(leaf, dev_item, 0); 1867 btrfs_set_device_start_offset(leaf, dev_item, 0); 1868 1869 ptr = btrfs_device_uuid(dev_item); 1870 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 1871 ptr = btrfs_device_fsid(dev_item); 1872 write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid, 1873 ptr, BTRFS_FSID_SIZE); 1874 btrfs_mark_buffer_dirty(leaf); 1875 1876 ret = 0; 1877 out: 1878 btrfs_free_path(path); 1879 return ret; 1880 } 1881 1882 /* 1883 * Function to update ctime/mtime for a given device path. 1884 * Mainly used for ctime/mtime based probe like libblkid. 1885 */ 1886 static void update_dev_time(struct block_device *bdev) 1887 { 1888 struct inode *inode = bdev->bd_inode; 1889 struct timespec64 now; 1890 1891 /* Shouldn't happen but just in case. 
 */
	if (!inode)
		return;

	now = current_time(inode);
	generic_update_time(inode, &now, S_MTIME | S_CTIME);
}

static int btrfs_rm_dev_item(struct btrfs_device *device)
{
	struct btrfs_root *root = device->fs_info->chunk_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_trans_handle *trans;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
	}

out:
	btrfs_free_path(path);
	if (!ret)
		ret = btrfs_commit_transaction(trans);
	return ret;
}

/*
 * Verify that @num_devices satisfies the RAID profile constraints in the whole
 * filesystem. It's up to the caller to adjust that number regarding e.g. device
 * replace.
 */
static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
		u64 num_devices)
{
	u64 all_avail;
	unsigned seq;
	int i;

	do {
		seq = read_seqbegin(&fs_info->profiles_lock);

		all_avail = fs_info->avail_data_alloc_bits |
			    fs_info->avail_system_alloc_bits |
			    fs_info->avail_metadata_alloc_bits;
	} while (read_seqretry(&fs_info->profiles_lock, seq));

	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
		if (!(all_avail & btrfs_raid_array[i].bg_flag))
			continue;

		if (num_devices < btrfs_raid_array[i].devs_min)
			return btrfs_raid_array[i].mindev_error;
	}

	return 0;
}

static struct btrfs_device * btrfs_find_next_active_device(
		struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
{
	struct btrfs_device *next_device;

	list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
		if (next_device != device &&
		    !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
		    && next_device->bdev)
			return next_device;
	}

	return NULL;
}

/*
 * Helper function to check if the given device is part of s_bdev / latest_bdev
 * and replace it with the provided or the next active device. In the context
 * where this function is called, there should always be another active device
 * (either the passed @next_device or one found on the list).
 */
void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
					    struct btrfs_device *next_device)
{
	struct btrfs_fs_info *fs_info = device->fs_info;

	if (!next_device)
		next_device = btrfs_find_next_active_device(fs_info->fs_devices,
							    device);
	ASSERT(next_device);

	if (fs_info->sb->s_bdev &&
			(fs_info->sb->s_bdev == device->bdev))
		fs_info->sb->s_bdev = next_device->bdev;

	if (fs_info->fs_devices->latest_bdev == device->bdev)
		fs_info->fs_devices->latest_bdev = next_device->bdev;
}

/*
 * Return btrfs_fs_devices::num_devices excluding the device that's being
 * currently replaced.
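 *
 * In other words, while a replace is running the target device is not
 * counted, so callers can reason about the real member count.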
2015 */ 2016 static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) 2017 { 2018 u64 num_devices = fs_info->fs_devices->num_devices; 2019 2020 down_read(&fs_info->dev_replace.rwsem); 2021 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { 2022 ASSERT(num_devices > 1); 2023 num_devices--; 2024 } 2025 up_read(&fs_info->dev_replace.rwsem); 2026 2027 return num_devices; 2028 } 2029 2030 void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, 2031 struct block_device *bdev, 2032 const char *device_path) 2033 { 2034 struct btrfs_super_block *disk_super; 2035 int copy_num; 2036 2037 if (!bdev) 2038 return; 2039 2040 for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) { 2041 struct page *page; 2042 int ret; 2043 2044 disk_super = btrfs_read_dev_one_super(bdev, copy_num); 2045 if (IS_ERR(disk_super)) 2046 continue; 2047 2048 if (bdev_is_zoned(bdev)) { 2049 btrfs_reset_sb_log_zones(bdev, copy_num); 2050 continue; 2051 } 2052 2053 memset(&disk_super->magic, 0, sizeof(disk_super->magic)); 2054 2055 page = virt_to_page(disk_super); 2056 set_page_dirty(page); 2057 lock_page(page); 2058 /* write_one_page() unlocks the page */ 2059 ret = write_one_page(page); 2060 if (ret) 2061 btrfs_warn(fs_info, 2062 "error clearing superblock number %d (%d)", 2063 copy_num, ret); 2064 btrfs_release_disk_super(disk_super); 2065 2066 } 2067 2068 /* Notify udev that device has changed */ 2069 btrfs_kobject_uevent(bdev, KOBJ_CHANGE); 2070 2071 /* Update ctime/mtime for device path for libblkid */ 2072 update_dev_time(bdev); 2073 } 2074 2075 int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, 2076 u64 devid, struct block_device **bdev, fmode_t *mode) 2077 { 2078 struct btrfs_device *device; 2079 struct btrfs_fs_devices *cur_devices; 2080 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2081 u64 num_devices; 2082 int ret = 0; 2083 2084 mutex_lock(&uuid_mutex); 2085 2086 num_devices = btrfs_num_devices(fs_info); 2087 2088 ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); 2089 if (ret) 2090 goto out; 2091 2092 device = btrfs_find_device_by_devspec(fs_info, devid, device_path); 2093 2094 if (IS_ERR(device)) { 2095 if (PTR_ERR(device) == -ENOENT && 2096 device_path && strcmp(device_path, "missing") == 0) 2097 ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; 2098 else 2099 ret = PTR_ERR(device); 2100 goto out; 2101 } 2102 2103 if (btrfs_pinned_by_swapfile(fs_info, device)) { 2104 btrfs_warn_in_rcu(fs_info, 2105 "cannot remove device %s (devid %llu) due to active swapfile", 2106 rcu_str_deref(device->name), device->devid); 2107 ret = -ETXTBSY; 2108 goto out; 2109 } 2110 2111 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 2112 ret = BTRFS_ERROR_DEV_TGT_REPLACE; 2113 goto out; 2114 } 2115 2116 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 2117 fs_info->fs_devices->rw_devices == 1) { 2118 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; 2119 goto out; 2120 } 2121 2122 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 2123 mutex_lock(&fs_info->chunk_mutex); 2124 list_del_init(&device->dev_alloc_list); 2125 device->fs_devices->rw_devices--; 2126 mutex_unlock(&fs_info->chunk_mutex); 2127 } 2128 2129 mutex_unlock(&uuid_mutex); 2130 ret = btrfs_shrink_device(device, 0); 2131 if (!ret) 2132 btrfs_reada_remove_dev(device); 2133 mutex_lock(&uuid_mutex); 2134 if (ret) 2135 goto error_undo; 2136 2137 /* 2138 * TODO: the superblock still includes this device in its num_devices 2139 counter although write_all_supers() is not locked out.
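* A concurrent super block write may therefore still record a device count that no longer matches the remaining device items.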
This 2140 * could give a filesystem state which requires a degraded mount. 2141 */ 2142 ret = btrfs_rm_dev_item(device); 2143 if (ret) 2144 goto error_undo; 2145 2146 clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 2147 btrfs_scrub_cancel_dev(device); 2148 2149 /* 2150 * the device list mutex makes sure that we don't change 2151 * the device list while someone else is writing out all 2152 * the device supers. Whoever is writing all supers, should 2153 * lock the device list mutex before getting the number of 2154 * devices in the super block (super_copy). Conversely, 2155 * whoever updates the number of devices in the super block 2156 * (super_copy) should hold the device list mutex. 2157 */ 2158 2159 /* 2160 * In normal cases the cur_devices == fs_devices. But in case 2161 * of deleting a seed device, the cur_devices should point to 2162 * its own fs_devices listed under the fs_devices->seed. 2163 */ 2164 cur_devices = device->fs_devices; 2165 mutex_lock(&fs_devices->device_list_mutex); 2166 list_del_rcu(&device->dev_list); 2167 2168 cur_devices->num_devices--; 2169 cur_devices->total_devices--; 2170 /* Update total_devices of the parent fs_devices if it's seed */ 2171 if (cur_devices != fs_devices) 2172 fs_devices->total_devices--; 2173 2174 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) 2175 cur_devices->missing_devices--; 2176 2177 btrfs_assign_next_active_device(device, NULL); 2178 2179 if (device->bdev) { 2180 cur_devices->open_devices--; 2181 /* remove sysfs entry */ 2182 btrfs_sysfs_remove_device(device); 2183 } 2184 2185 num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; 2186 btrfs_set_super_num_devices(fs_info->super_copy, num_devices); 2187 mutex_unlock(&fs_devices->device_list_mutex); 2188 2189 /* 2190 * At this point, the device is zero sized and detached from the 2191 * devices list. All that's left is to zero out the old supers and 2192 * free the device. 2193 * 2194 * We cannot call btrfs_close_bdev() here because we're holding the sb 2195 * write lock, and blkdev_put() will pull in the ->open_mutex on the 2196 * block device and it's dependencies. Instead just flush the device 2197 * and let the caller do the final blkdev_put. 2198 */ 2199 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 2200 btrfs_scratch_superblocks(fs_info, device->bdev, 2201 device->name->str); 2202 if (device->bdev) { 2203 sync_blockdev(device->bdev); 2204 invalidate_bdev(device->bdev); 2205 } 2206 } 2207 2208 *bdev = device->bdev; 2209 *mode = device->mode; 2210 synchronize_rcu(); 2211 btrfs_free_device(device); 2212 2213 if (cur_devices->open_devices == 0) { 2214 list_del_init(&cur_devices->seed_list); 2215 close_fs_devices(cur_devices); 2216 free_fs_devices(cur_devices); 2217 } 2218 2219 out: 2220 mutex_unlock(&uuid_mutex); 2221 return ret; 2222 2223 error_undo: 2224 btrfs_reada_undo_remove_dev(device); 2225 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 2226 mutex_lock(&fs_info->chunk_mutex); 2227 list_add(&device->dev_alloc_list, 2228 &fs_devices->alloc_list); 2229 device->fs_devices->rw_devices++; 2230 mutex_unlock(&fs_info->chunk_mutex); 2231 } 2232 goto out; 2233 } 2234 2235 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev) 2236 { 2237 struct btrfs_fs_devices *fs_devices; 2238 2239 lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex); 2240 2241 /* 2242 * in case of fs with no seed, srcdev->fs_devices will point 2243 * to fs_devices of fs_info. 
However when the dev being replaced is 2244 * a seed dev it will point to the seed's local fs_devices. In short 2245 * srcdev will have its correct fs_devices in both cases. 2246 */ 2247 fs_devices = srcdev->fs_devices; 2248 2249 list_del_rcu(&srcdev->dev_list); 2250 list_del(&srcdev->dev_alloc_list); 2251 fs_devices->num_devices--; 2252 if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state)) 2253 fs_devices->missing_devices--; 2254 2255 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) 2256 fs_devices->rw_devices--; 2257 2258 if (srcdev->bdev) 2259 fs_devices->open_devices--; 2260 } 2261 2262 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) 2263 { 2264 struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; 2265 2266 mutex_lock(&uuid_mutex); 2267 2268 btrfs_close_bdev(srcdev); 2269 synchronize_rcu(); 2270 btrfs_free_device(srcdev); 2271 2272 /* if there are no devices left we'd rather delete the fs_devices */ 2273 if (!fs_devices->num_devices) { 2274 /* 2275 * On a mounted FS, num_devices can't be zero unless it's a 2276 * seed. In case of a seed device being replaced, the replace 2277 * target is added to the sprout FS, so there will be no more 2278 * devices left under the seed FS. 2279 */ 2280 ASSERT(fs_devices->seeding); 2281 2282 list_del_init(&fs_devices->seed_list); 2283 close_fs_devices(fs_devices); 2284 free_fs_devices(fs_devices); 2285 } 2286 mutex_unlock(&uuid_mutex); 2287 } 2288 2289 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) 2290 { 2291 struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices; 2292 2293 mutex_lock(&fs_devices->device_list_mutex); 2294 2295 btrfs_sysfs_remove_device(tgtdev); 2296 2297 if (tgtdev->bdev) 2298 fs_devices->open_devices--; 2299 2300 fs_devices->num_devices--; 2301 2302 btrfs_assign_next_active_device(tgtdev, NULL); 2303 2304 list_del_rcu(&tgtdev->dev_list); 2305 2306 mutex_unlock(&fs_devices->device_list_mutex); 2307 2308 /* 2309 * The update_dev_time() within btrfs_scratch_superblocks() 2310 * may lead to a call to btrfs_show_devname() which will try 2311 * to hold device_list_mutex. And here this device 2312 * is already out of the device list, so we don't have to hold 2313 * the device_list_mutex lock. 2314 */ 2315 btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev, 2316 tgtdev->name->str); 2317 2318 btrfs_close_bdev(tgtdev); 2319 synchronize_rcu(); 2320 btrfs_free_device(tgtdev); 2321 } 2322 2323 static struct btrfs_device *btrfs_find_device_by_path( 2324 struct btrfs_fs_info *fs_info, const char *device_path) 2325 { 2326 int ret = 0; 2327 struct btrfs_super_block *disk_super; 2328 u64 devid; 2329 u8 *dev_uuid; 2330 struct block_device *bdev; 2331 struct btrfs_device *device; 2332 2333 ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, 2334 fs_info->bdev_holder, 0, &bdev, &disk_super); 2335 if (ret) 2336 return ERR_PTR(ret); 2337 2338 devid = btrfs_stack_device_id(&disk_super->dev_item); 2339 dev_uuid = disk_super->dev_item.uuid; 2340 if (btrfs_fs_incompat(fs_info, METADATA_UUID)) 2341 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, 2342 disk_super->metadata_uuid); 2343 else 2344 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, 2345 disk_super->fsid); 2346 2347 btrfs_release_disk_super(disk_super); 2348 if (!device) 2349 device = ERR_PTR(-ENOENT); 2350 blkdev_put(bdev, FMODE_READ); 2351 return device; 2352 } 2353 2354 /* 2355 * Lookup a device given by device id, or the path if the id is 0.
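* The special path string "missing" selects the first device that is recorded in the metadata but has no backing bdev.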
2356 */ 2357 struct btrfs_device *btrfs_find_device_by_devspec( 2358 struct btrfs_fs_info *fs_info, u64 devid, 2359 const char *device_path) 2360 { 2361 struct btrfs_device *device; 2362 2363 if (devid) { 2364 device = btrfs_find_device(fs_info->fs_devices, devid, NULL, 2365 NULL); 2366 if (!device) 2367 return ERR_PTR(-ENOENT); 2368 return device; 2369 } 2370 2371 if (!device_path || !device_path[0]) 2372 return ERR_PTR(-EINVAL); 2373 2374 if (strcmp(device_path, "missing") == 0) { 2375 /* Find first missing device */ 2376 list_for_each_entry(device, &fs_info->fs_devices->devices, 2377 dev_list) { 2378 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 2379 &device->dev_state) && !device->bdev) 2380 return device; 2381 } 2382 return ERR_PTR(-ENOENT); 2383 } 2384 2385 return btrfs_find_device_by_path(fs_info, device_path); 2386 } 2387 2388 /* 2389 * does all the dirty work required for changing file system's UUID. 2390 */ 2391 static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) 2392 { 2393 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2394 struct btrfs_fs_devices *old_devices; 2395 struct btrfs_fs_devices *seed_devices; 2396 struct btrfs_super_block *disk_super = fs_info->super_copy; 2397 struct btrfs_device *device; 2398 u64 super_flags; 2399 2400 lockdep_assert_held(&uuid_mutex); 2401 if (!fs_devices->seeding) 2402 return -EINVAL; 2403 2404 /* 2405 * Private copy of the seed devices, anchored at 2406 * fs_info->fs_devices->seed_list 2407 */ 2408 seed_devices = alloc_fs_devices(NULL, NULL); 2409 if (IS_ERR(seed_devices)) 2410 return PTR_ERR(seed_devices); 2411 2412 /* 2413 * It's necessary to retain a copy of the original seed fs_devices in 2414 * fs_uuids so that filesystems which have been seeded can successfully 2415 * reference the seed device from open_seed_devices. This also supports 2416 * multiple fs seed. 2417 */ 2418 old_devices = clone_fs_devices(fs_devices); 2419 if (IS_ERR(old_devices)) { 2420 kfree(seed_devices); 2421 return PTR_ERR(old_devices); 2422 } 2423 2424 list_add(&old_devices->fs_list, &fs_uuids); 2425 2426 memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); 2427 seed_devices->opened = 1; 2428 INIT_LIST_HEAD(&seed_devices->devices); 2429 INIT_LIST_HEAD(&seed_devices->alloc_list); 2430 mutex_init(&seed_devices->device_list_mutex); 2431 2432 mutex_lock(&fs_devices->device_list_mutex); 2433 list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, 2434 synchronize_rcu); 2435 list_for_each_entry(device, &seed_devices->devices, dev_list) 2436 device->fs_devices = seed_devices; 2437 2438 fs_devices->seeding = false; 2439 fs_devices->num_devices = 0; 2440 fs_devices->open_devices = 0; 2441 fs_devices->missing_devices = 0; 2442 fs_devices->rotating = false; 2443 list_add(&seed_devices->seed_list, &fs_devices->seed_list); 2444 2445 generate_random_uuid(fs_devices->fsid); 2446 memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE); 2447 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 2448 mutex_unlock(&fs_devices->device_list_mutex); 2449 2450 super_flags = btrfs_super_flags(disk_super) & 2451 ~BTRFS_SUPER_FLAG_SEEDING; 2452 btrfs_set_super_flags(disk_super, super_flags); 2453 2454 return 0; 2455 } 2456 2457 /* 2458 * Store the expected generation for seed devices in device items. 
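* This walks every DEV_ITEM in the chunk tree and stamps the current device generation on items that belong to a seeding fs_devices.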
2459 */ 2460 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) 2461 { 2462 struct btrfs_fs_info *fs_info = trans->fs_info; 2463 struct btrfs_root *root = fs_info->chunk_root; 2464 struct btrfs_path *path; 2465 struct extent_buffer *leaf; 2466 struct btrfs_dev_item *dev_item; 2467 struct btrfs_device *device; 2468 struct btrfs_key key; 2469 u8 fs_uuid[BTRFS_FSID_SIZE]; 2470 u8 dev_uuid[BTRFS_UUID_SIZE]; 2471 u64 devid; 2472 int ret; 2473 2474 path = btrfs_alloc_path(); 2475 if (!path) 2476 return -ENOMEM; 2477 2478 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2479 key.offset = 0; 2480 key.type = BTRFS_DEV_ITEM_KEY; 2481 2482 while (1) { 2483 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2484 if (ret < 0) 2485 goto error; 2486 2487 leaf = path->nodes[0]; 2488 next_slot: 2489 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 2490 ret = btrfs_next_leaf(root, path); 2491 if (ret > 0) 2492 break; 2493 if (ret < 0) 2494 goto error; 2495 leaf = path->nodes[0]; 2496 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2497 btrfs_release_path(path); 2498 continue; 2499 } 2500 2501 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2502 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 2503 key.type != BTRFS_DEV_ITEM_KEY) 2504 break; 2505 2506 dev_item = btrfs_item_ptr(leaf, path->slots[0], 2507 struct btrfs_dev_item); 2508 devid = btrfs_device_id(leaf, dev_item); 2509 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 2510 BTRFS_UUID_SIZE); 2511 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 2512 BTRFS_FSID_SIZE); 2513 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, 2514 fs_uuid); 2515 BUG_ON(!device); /* Logic error */ 2516 2517 if (device->fs_devices->seeding) { 2518 btrfs_set_device_generation(leaf, dev_item, 2519 device->generation); 2520 btrfs_mark_buffer_dirty(leaf); 2521 } 2522 2523 path->slots[0]++; 2524 goto next_slot; 2525 } 2526 ret = 0; 2527 error: 2528 btrfs_free_path(path); 2529 return ret; 2530 } 2531 2532 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) 2533 { 2534 struct btrfs_root *root = fs_info->dev_root; 2535 struct request_queue *q; 2536 struct btrfs_trans_handle *trans; 2537 struct btrfs_device *device; 2538 struct block_device *bdev; 2539 struct super_block *sb = fs_info->sb; 2540 struct rcu_string *name; 2541 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2542 u64 orig_super_total_bytes; 2543 u64 orig_super_num_devices; 2544 int seeding_dev = 0; 2545 int ret = 0; 2546 bool locked = false; 2547 2548 if (sb_rdonly(sb) && !fs_devices->seeding) 2549 return -EROFS; 2550 2551 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 2552 fs_info->bdev_holder); 2553 if (IS_ERR(bdev)) 2554 return PTR_ERR(bdev); 2555 2556 if (!btrfs_check_device_zone_type(fs_info, bdev)) { 2557 ret = -EINVAL; 2558 goto error; 2559 } 2560 2561 if (fs_devices->seeding) { 2562 seeding_dev = 1; 2563 down_write(&sb->s_umount); 2564 mutex_lock(&uuid_mutex); 2565 locked = true; 2566 } 2567 2568 sync_blockdev(bdev); 2569 2570 rcu_read_lock(); 2571 list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { 2572 if (device->bdev == bdev) { 2573 ret = -EEXIST; 2574 rcu_read_unlock(); 2575 goto error; 2576 } 2577 } 2578 rcu_read_unlock(); 2579 2580 device = btrfs_alloc_device(fs_info, NULL, NULL); 2581 if (IS_ERR(device)) { 2582 /* we can safely leave the fs_devices entry around */ 2583 ret = PTR_ERR(device); 2584 goto error; 2585 } 2586 2587 name = rcu_string_strdup(device_path, 
GFP_KERNEL); 2588 if (!name) { 2589 ret = -ENOMEM; 2590 goto error_free_device; 2591 } 2592 rcu_assign_pointer(device->name, name); 2593 2594 device->fs_info = fs_info; 2595 device->bdev = bdev; 2596 2597 ret = btrfs_get_dev_zone_info(device); 2598 if (ret) 2599 goto error_free_device; 2600 2601 trans = btrfs_start_transaction(root, 0); 2602 if (IS_ERR(trans)) { 2603 ret = PTR_ERR(trans); 2604 goto error_free_zone; 2605 } 2606 2607 q = bdev_get_queue(bdev); 2608 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 2609 device->generation = trans->transid; 2610 device->io_width = fs_info->sectorsize; 2611 device->io_align = fs_info->sectorsize; 2612 device->sector_size = fs_info->sectorsize; 2613 device->total_bytes = round_down(i_size_read(bdev->bd_inode), 2614 fs_info->sectorsize); 2615 device->disk_total_bytes = device->total_bytes; 2616 device->commit_total_bytes = device->total_bytes; 2617 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 2618 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 2619 device->mode = FMODE_EXCL; 2620 device->dev_stats_valid = 1; 2621 set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); 2622 2623 if (seeding_dev) { 2624 btrfs_clear_sb_rdonly(sb); 2625 ret = btrfs_prepare_sprout(fs_info); 2626 if (ret) { 2627 btrfs_abort_transaction(trans, ret); 2628 goto error_trans; 2629 } 2630 } 2631 2632 device->fs_devices = fs_devices; 2633 2634 mutex_lock(&fs_devices->device_list_mutex); 2635 mutex_lock(&fs_info->chunk_mutex); 2636 list_add_rcu(&device->dev_list, &fs_devices->devices); 2637 list_add(&device->dev_alloc_list, &fs_devices->alloc_list); 2638 fs_devices->num_devices++; 2639 fs_devices->open_devices++; 2640 fs_devices->rw_devices++; 2641 fs_devices->total_devices++; 2642 fs_devices->total_rw_bytes += device->total_bytes; 2643 2644 atomic64_add(device->total_bytes, &fs_info->free_chunk_space); 2645 2646 if (!blk_queue_nonrot(q)) 2647 fs_devices->rotating = true; 2648 2649 orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); 2650 btrfs_set_super_total_bytes(fs_info->super_copy, 2651 round_down(orig_super_total_bytes + device->total_bytes, 2652 fs_info->sectorsize)); 2653 2654 orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy); 2655 btrfs_set_super_num_devices(fs_info->super_copy, 2656 orig_super_num_devices + 1); 2657 2658 /* 2659 * we've got more storage, clear any full flags on the space 2660 * infos 2661 */ 2662 btrfs_clear_space_info_full(fs_info); 2663 2664 mutex_unlock(&fs_info->chunk_mutex); 2665 2666 /* Add sysfs device entry */ 2667 btrfs_sysfs_add_device(device); 2668 2669 mutex_unlock(&fs_devices->device_list_mutex); 2670 2671 if (seeding_dev) { 2672 mutex_lock(&fs_info->chunk_mutex); 2673 ret = init_first_rw_device(trans); 2674 mutex_unlock(&fs_info->chunk_mutex); 2675 if (ret) { 2676 btrfs_abort_transaction(trans, ret); 2677 goto error_sysfs; 2678 } 2679 } 2680 2681 ret = btrfs_add_dev_item(trans, device); 2682 if (ret) { 2683 btrfs_abort_transaction(trans, ret); 2684 goto error_sysfs; 2685 } 2686 2687 if (seeding_dev) { 2688 ret = btrfs_finish_sprout(trans); 2689 if (ret) { 2690 btrfs_abort_transaction(trans, ret); 2691 goto error_sysfs; 2692 } 2693 2694 /* 2695 * fs_devices now represents the newly sprouted filesystem and 2696 * its fsid has been changed by btrfs_prepare_sprout 2697 */ 2698 btrfs_sysfs_update_sprout_fsid(fs_devices); 2699 } 2700 2701 ret = btrfs_commit_transaction(trans); 2702 2703 if (seeding_dev) { 2704 mutex_unlock(&uuid_mutex); 2705 up_write(&sb->s_umount); 2706 
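/* both seeding locks were released above; clear the flag so the error path does not unlock them again */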
locked = false; 2707 2708 if (ret) /* transaction commit */ 2709 return ret; 2710 2711 ret = btrfs_relocate_sys_chunks(fs_info); 2712 if (ret < 0) 2713 btrfs_handle_fs_error(fs_info, ret, 2714 "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command."); 2715 trans = btrfs_attach_transaction(root); 2716 if (IS_ERR(trans)) { 2717 if (PTR_ERR(trans) == -ENOENT) 2718 return 0; 2719 ret = PTR_ERR(trans); 2720 trans = NULL; 2721 goto error_sysfs; 2722 } 2723 ret = btrfs_commit_transaction(trans); 2724 } 2725 2726 /* 2727 * Now that we have written a new super block to this device, check all 2728 * other fs_devices list if device_path alienates any other scanned 2729 * device. 2730 * We can ignore the return value as it typically returns -EINVAL and 2731 * only succeeds if the device was an alien. 2732 */ 2733 btrfs_forget_devices(device_path); 2734 2735 /* Update ctime/mtime for blkid or udev */ 2736 update_dev_time(bdev); 2737 2738 return ret; 2739 2740 error_sysfs: 2741 btrfs_sysfs_remove_device(device); 2742 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2743 mutex_lock(&fs_info->chunk_mutex); 2744 list_del_rcu(&device->dev_list); 2745 list_del(&device->dev_alloc_list); 2746 fs_info->fs_devices->num_devices--; 2747 fs_info->fs_devices->open_devices--; 2748 fs_info->fs_devices->rw_devices--; 2749 fs_info->fs_devices->total_devices--; 2750 fs_info->fs_devices->total_rw_bytes -= device->total_bytes; 2751 atomic64_sub(device->total_bytes, &fs_info->free_chunk_space); 2752 btrfs_set_super_total_bytes(fs_info->super_copy, 2753 orig_super_total_bytes); 2754 btrfs_set_super_num_devices(fs_info->super_copy, 2755 orig_super_num_devices); 2756 mutex_unlock(&fs_info->chunk_mutex); 2757 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2758 error_trans: 2759 if (seeding_dev) 2760 btrfs_set_sb_rdonly(sb); 2761 if (trans) 2762 btrfs_end_transaction(trans); 2763 error_free_zone: 2764 btrfs_destroy_dev_zone_info(device); 2765 error_free_device: 2766 btrfs_free_device(device); 2767 error: 2768 blkdev_put(bdev, FMODE_EXCL); 2769 if (locked) { 2770 mutex_unlock(&uuid_mutex); 2771 up_write(&sb->s_umount); 2772 } 2773 return ret; 2774 } 2775 2776 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 2777 struct btrfs_device *device) 2778 { 2779 int ret; 2780 struct btrfs_path *path; 2781 struct btrfs_root *root = device->fs_info->chunk_root; 2782 struct btrfs_dev_item *dev_item; 2783 struct extent_buffer *leaf; 2784 struct btrfs_key key; 2785 2786 path = btrfs_alloc_path(); 2787 if (!path) 2788 return -ENOMEM; 2789 2790 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2791 key.type = BTRFS_DEV_ITEM_KEY; 2792 key.offset = device->devid; 2793 2794 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2795 if (ret < 0) 2796 goto out; 2797 2798 if (ret > 0) { 2799 ret = -ENOENT; 2800 goto out; 2801 } 2802 2803 leaf = path->nodes[0]; 2804 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 2805 2806 btrfs_set_device_id(leaf, dev_item, device->devid); 2807 btrfs_set_device_type(leaf, dev_item, device->type); 2808 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 2809 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 2810 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 2811 btrfs_set_device_total_bytes(leaf, dev_item, 2812 btrfs_device_get_disk_total_bytes(device)); 2813 btrfs_set_device_bytes_used(leaf, dev_item, 2814 btrfs_device_get_bytes_used(device)); 2815 
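/* unlike btrfs_add_dev_item(), generation and the UUID/FSID fields are left untouched here */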
btrfs_mark_buffer_dirty(leaf); 2816 2817 out: 2818 btrfs_free_path(path); 2819 return ret; 2820 } 2821 2822 int btrfs_grow_device(struct btrfs_trans_handle *trans, 2823 struct btrfs_device *device, u64 new_size) 2824 { 2825 struct btrfs_fs_info *fs_info = device->fs_info; 2826 struct btrfs_super_block *super_copy = fs_info->super_copy; 2827 u64 old_total; 2828 u64 diff; 2829 2830 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 2831 return -EACCES; 2832 2833 new_size = round_down(new_size, fs_info->sectorsize); 2834 2835 mutex_lock(&fs_info->chunk_mutex); 2836 old_total = btrfs_super_total_bytes(super_copy); 2837 diff = round_down(new_size - device->total_bytes, fs_info->sectorsize); 2838 2839 if (new_size <= device->total_bytes || 2840 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 2841 mutex_unlock(&fs_info->chunk_mutex); 2842 return -EINVAL; 2843 } 2844 2845 btrfs_set_super_total_bytes(super_copy, 2846 round_down(old_total + diff, fs_info->sectorsize)); 2847 device->fs_devices->total_rw_bytes += diff; 2848 2849 btrfs_device_set_total_bytes(device, new_size); 2850 btrfs_device_set_disk_total_bytes(device, new_size); 2851 btrfs_clear_space_info_full(device->fs_info); 2852 if (list_empty(&device->post_commit_list)) 2853 list_add_tail(&device->post_commit_list, 2854 &trans->transaction->dev_update_list); 2855 mutex_unlock(&fs_info->chunk_mutex); 2856 2857 return btrfs_update_device(trans, device); 2858 } 2859 2860 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) 2861 { 2862 struct btrfs_fs_info *fs_info = trans->fs_info; 2863 struct btrfs_root *root = fs_info->chunk_root; 2864 int ret; 2865 struct btrfs_path *path; 2866 struct btrfs_key key; 2867 2868 path = btrfs_alloc_path(); 2869 if (!path) 2870 return -ENOMEM; 2871 2872 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2873 key.offset = chunk_offset; 2874 key.type = BTRFS_CHUNK_ITEM_KEY; 2875 2876 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2877 if (ret < 0) 2878 goto out; 2879 else if (ret > 0) { /* Logic error or corruption */ 2880 btrfs_handle_fs_error(fs_info, -ENOENT, 2881 "Failed lookup while freeing chunk."); 2882 ret = -ENOENT; 2883 goto out; 2884 } 2885 2886 ret = btrfs_del_item(trans, root, path); 2887 if (ret < 0) 2888 btrfs_handle_fs_error(fs_info, ret, 2889 "Failed to delete chunk item."); 2890 out: 2891 btrfs_free_path(path); 2892 return ret; 2893 } 2894 2895 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 2896 { 2897 struct btrfs_super_block *super_copy = fs_info->super_copy; 2898 struct btrfs_disk_key *disk_key; 2899 struct btrfs_chunk *chunk; 2900 u8 *ptr; 2901 int ret = 0; 2902 u32 num_stripes; 2903 u32 array_size; 2904 u32 len = 0; 2905 u32 cur; 2906 struct btrfs_key key; 2907 2908 lockdep_assert_held(&fs_info->chunk_mutex); 2909 array_size = btrfs_super_sys_array_size(super_copy); 2910 2911 ptr = super_copy->sys_chunk_array; 2912 cur = 0; 2913 2914 while (cur < array_size) { 2915 disk_key = (struct btrfs_disk_key *)ptr; 2916 btrfs_disk_key_to_cpu(&key, disk_key); 2917 2918 len = sizeof(*disk_key); 2919 2920 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 2921 chunk = (struct btrfs_chunk *)(ptr + len); 2922 num_stripes = btrfs_stack_chunk_num_stripes(chunk); 2923 len += btrfs_chunk_item_size(num_stripes); 2924 } else { 2925 ret = -EIO; 2926 break; 2927 } 2928 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID && 2929 key.offset == chunk_offset) { 2930 memmove(ptr, ptr + len, array_size - (cur + len)); 2931 array_size -= len; 
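/* shrink the recorded array size; ptr and cur deliberately stay put so the entry moved into this slot is parsed on the next loop iteration */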
2932 btrfs_set_super_sys_array_size(super_copy, array_size); 2933 } else { 2934 ptr += len; 2935 cur += len; 2936 } 2937 } 2938 return ret; 2939 } 2940 2941 /* 2942 * btrfs_get_chunk_map() - Find the mapping containing the given logical extent. 2943 * @logical: Logical block offset in bytes. 2944 * @length: Length of extent in bytes. 2945 * 2946 * Return: Chunk mapping or ERR_PTR. 2947 */ 2948 struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, 2949 u64 logical, u64 length) 2950 { 2951 struct extent_map_tree *em_tree; 2952 struct extent_map *em; 2953 2954 em_tree = &fs_info->mapping_tree; 2955 read_lock(&em_tree->lock); 2956 em = lookup_extent_mapping(em_tree, logical, length); 2957 read_unlock(&em_tree->lock); 2958 2959 if (!em) { 2960 btrfs_crit(fs_info, "unable to find logical %llu length %llu", 2961 logical, length); 2962 return ERR_PTR(-EINVAL); 2963 } 2964 2965 if (em->start > logical || em->start + em->len < logical) { 2966 btrfs_crit(fs_info, 2967 "found a bad mapping, wanted %llu-%llu, found %llu-%llu", 2968 logical, length, em->start, em->start + em->len); 2969 free_extent_map(em); 2970 return ERR_PTR(-EINVAL); 2971 } 2972 2973 /* callers are responsible for dropping em's ref. */ 2974 return em; 2975 } 2976 2977 static int remove_chunk_item(struct btrfs_trans_handle *trans, 2978 struct map_lookup *map, u64 chunk_offset) 2979 { 2980 int i; 2981 2982 /* 2983 * Removing chunk items and updating the device items in the chunks btree 2984 * requires holding the chunk_mutex. 2985 * See the comment at btrfs_chunk_alloc() for the details. 2986 */ 2987 lockdep_assert_held(&trans->fs_info->chunk_mutex); 2988 2989 for (i = 0; i < map->num_stripes; i++) { 2990 int ret; 2991 2992 ret = btrfs_update_device(trans, map->stripes[i].dev); 2993 if (ret) 2994 return ret; 2995 } 2996 2997 return btrfs_free_chunk(trans, chunk_offset); 2998 } 2999 3000 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) 3001 { 3002 struct btrfs_fs_info *fs_info = trans->fs_info; 3003 struct extent_map *em; 3004 struct map_lookup *map; 3005 u64 dev_extent_len = 0; 3006 int i, ret = 0; 3007 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 3008 3009 em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); 3010 if (IS_ERR(em)) { 3011 /* 3012 * This is a logic error, but we don't want to just rely on the 3013 * user having built with ASSERT enabled, so if ASSERT doesn't 3014 * do anything we still error out. 3015 */ 3016 ASSERT(0); 3017 return PTR_ERR(em); 3018 } 3019 map = em->map_lookup; 3020 3021 /* 3022 * First delete the device extent items from the devices btree. 3023 * We take the device_list_mutex to avoid racing with the finishing phase 3024 * of a device replace operation. See the comment below before acquiring 3025 * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex 3026 * because that can result in a deadlock when deleting the device extent 3027 * items from the devices btree - COWing an extent buffer from the btree 3028 * may result in allocating a new metadata chunk, which would attempt to 3029 * lock again fs_info->chunk_mutex. 
3030 */ 3031 mutex_lock(&fs_devices->device_list_mutex); 3032 for (i = 0; i < map->num_stripes; i++) { 3033 struct btrfs_device *device = map->stripes[i].dev; 3034 ret = btrfs_free_dev_extent(trans, device, 3035 map->stripes[i].physical, 3036 &dev_extent_len); 3037 if (ret) { 3038 mutex_unlock(&fs_devices->device_list_mutex); 3039 btrfs_abort_transaction(trans, ret); 3040 goto out; 3041 } 3042 3043 if (device->bytes_used > 0) { 3044 mutex_lock(&fs_info->chunk_mutex); 3045 btrfs_device_set_bytes_used(device, 3046 device->bytes_used - dev_extent_len); 3047 atomic64_add(dev_extent_len, &fs_info->free_chunk_space); 3048 btrfs_clear_space_info_full(fs_info); 3049 mutex_unlock(&fs_info->chunk_mutex); 3050 } 3051 } 3052 mutex_unlock(&fs_devices->device_list_mutex); 3053 3054 /* 3055 * We acquire fs_info->chunk_mutex for 2 reasons: 3056 * 3057 * 1) Just like with the first phase of the chunk allocation, we must 3058 * reserve system space, do all chunk btree updates and deletions, and 3059 * update the system chunk array in the superblock while holding this 3060 * mutex. This is for similar reasons as explained in the comment at 3061 * the top of btrfs_chunk_alloc(); 3062 * 3063 * 2) Prevent races with the final phase of a device replace operation 3064 * that replaces the device object associated with the map's stripes, 3065 * because the device object's id can change at any time during that 3066 * final phase of the device replace operation 3067 * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the 3068 * replaced device and then see it with an ID of 3069 * BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating 3070 * the device item, which does not exist in the chunk btree. 3071 * The finishing phase of device replace acquires both the 3072 * device_list_mutex and the chunk_mutex, in that order, so we are 3073 * safe by just acquiring the chunk_mutex. 3074 */ 3075 trans->removing_chunk = true; 3076 mutex_lock(&fs_info->chunk_mutex); 3077 3078 check_system_chunk(trans, map->type); 3079 3080 ret = remove_chunk_item(trans, map, chunk_offset); 3081 /* 3082 * Normally we should not get -ENOSPC since we reserved space before 3083 * through the call to check_system_chunk(). 3084 * 3085 * Despite our system space_info having enough free space, we may not 3086 * be able to allocate extents from its block groups, because all have 3087 * an incompatible profile, which will force us to allocate a new system 3088 * block group with the right profile, or right after we called 3089 * check_system_chunk() above, a scrub turned the only system block group 3090 * with enough free space into RO mode. 3091 * This is explained with more detail at do_chunk_alloc(). 3092 * 3093 * So if we get -ENOSPC, allocate a new system chunk and retry once.
3094 */ 3095 if (ret == -ENOSPC) { 3096 const u64 sys_flags = btrfs_system_alloc_profile(fs_info); 3097 struct btrfs_block_group *sys_bg; 3098 3099 sys_bg = btrfs_alloc_chunk(trans, sys_flags); 3100 if (IS_ERR(sys_bg)) { 3101 ret = PTR_ERR(sys_bg); 3102 btrfs_abort_transaction(trans, ret); 3103 goto out; 3104 } 3105 3106 ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg); 3107 if (ret) { 3108 btrfs_abort_transaction(trans, ret); 3109 goto out; 3110 } 3111 3112 ret = remove_chunk_item(trans, map, chunk_offset); 3113 if (ret) { 3114 btrfs_abort_transaction(trans, ret); 3115 goto out; 3116 } 3117 } else if (ret) { 3118 btrfs_abort_transaction(trans, ret); 3119 goto out; 3120 } 3121 3122 trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len); 3123 3124 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 3125 ret = btrfs_del_sys_chunk(fs_info, chunk_offset); 3126 if (ret) { 3127 btrfs_abort_transaction(trans, ret); 3128 goto out; 3129 } 3130 } 3131 3132 mutex_unlock(&fs_info->chunk_mutex); 3133 trans->removing_chunk = false; 3134 3135 /* 3136 * We are done with chunk btree updates and deletions, so release the 3137 * system space we previously reserved (with check_system_chunk()). 3138 */ 3139 btrfs_trans_release_chunk_metadata(trans); 3140 3141 ret = btrfs_remove_block_group(trans, chunk_offset, em); 3142 if (ret) { 3143 btrfs_abort_transaction(trans, ret); 3144 goto out; 3145 } 3146 3147 out: 3148 if (trans->removing_chunk) { 3149 mutex_unlock(&fs_info->chunk_mutex); 3150 trans->removing_chunk = false; 3151 } 3152 /* once for us */ 3153 free_extent_map(em); 3154 return ret; 3155 } 3156 3157 int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 3158 { 3159 struct btrfs_root *root = fs_info->chunk_root; 3160 struct btrfs_trans_handle *trans; 3161 struct btrfs_block_group *block_group; 3162 u64 length; 3163 int ret; 3164 3165 /* 3166 * Prevent races with automatic removal of unused block groups. 3167 * After we relocate and before we remove the chunk with offset 3168 * chunk_offset, automatic removal of the block group can kick in, 3169 * resulting in a failure when calling btrfs_remove_chunk() below. 3170 * 3171 * Make sure to acquire this mutex before doing a tree search (dev 3172 * or chunk trees) to find chunks. Otherwise the cleaner kthread might 3173 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after 3174 * we release the path used to search the chunk/dev tree and before 3175 * the current task acquires this mutex and calls us. 3176 */ 3177 lockdep_assert_held(&fs_info->reclaim_bgs_lock); 3178 3179 /* step one, relocate all the extents inside this chunk */ 3180 btrfs_scrub_pause(fs_info); 3181 ret = btrfs_relocate_block_group(fs_info, chunk_offset); 3182 btrfs_scrub_continue(fs_info); 3183 if (ret) 3184 return ret; 3185 3186 block_group = btrfs_lookup_block_group(fs_info, chunk_offset); 3187 if (!block_group) 3188 return -ENOENT; 3189 btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); 3190 length = block_group->length; 3191 btrfs_put_block_group(block_group); 3192 3193 /* 3194 * On a zoned file system, discard the whole block group, this will 3195 * trigger a REQ_OP_ZONE_RESET operation on the device zone. If 3196 * resetting the zone fails, don't treat it as a fatal problem from the 3197 * filesystem's point of view. 
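* The chunk removal below continues regardless; only an informational message is printed.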
3198 */ 3199 if (btrfs_is_zoned(fs_info)) { 3200 ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL); 3201 if (ret) 3202 btrfs_info(fs_info, 3203 "failed to reset zone %llu after relocation", 3204 chunk_offset); 3205 } 3206 3207 trans = btrfs_start_trans_remove_block_group(root->fs_info, 3208 chunk_offset); 3209 if (IS_ERR(trans)) { 3210 ret = PTR_ERR(trans); 3211 btrfs_handle_fs_error(root->fs_info, ret, NULL); 3212 return ret; 3213 } 3214 3215 /* 3216 * step two, delete the device extents and the 3217 * chunk tree entries 3218 */ 3219 ret = btrfs_remove_chunk(trans, chunk_offset); 3220 btrfs_end_transaction(trans); 3221 return ret; 3222 } 3223 3224 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info) 3225 { 3226 struct btrfs_root *chunk_root = fs_info->chunk_root; 3227 struct btrfs_path *path; 3228 struct extent_buffer *leaf; 3229 struct btrfs_chunk *chunk; 3230 struct btrfs_key key; 3231 struct btrfs_key found_key; 3232 u64 chunk_type; 3233 bool retried = false; 3234 int failed = 0; 3235 int ret; 3236 3237 path = btrfs_alloc_path(); 3238 if (!path) 3239 return -ENOMEM; 3240 3241 again: 3242 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 3243 key.offset = (u64)-1; 3244 key.type = BTRFS_CHUNK_ITEM_KEY; 3245 3246 while (1) { 3247 mutex_lock(&fs_info->reclaim_bgs_lock); 3248 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 3249 if (ret < 0) { 3250 mutex_unlock(&fs_info->reclaim_bgs_lock); 3251 goto error; 3252 } 3253 BUG_ON(ret == 0); /* Corruption */ 3254 3255 ret = btrfs_previous_item(chunk_root, path, key.objectid, 3256 key.type); 3257 if (ret) 3258 mutex_unlock(&fs_info->reclaim_bgs_lock); 3259 if (ret < 0) 3260 goto error; 3261 if (ret > 0) 3262 break; 3263 3264 leaf = path->nodes[0]; 3265 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3266 3267 chunk = btrfs_item_ptr(leaf, path->slots[0], 3268 struct btrfs_chunk); 3269 chunk_type = btrfs_chunk_type(leaf, chunk); 3270 btrfs_release_path(path); 3271 3272 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 3273 ret = btrfs_relocate_chunk(fs_info, found_key.offset); 3274 if (ret == -ENOSPC) 3275 failed++; 3276 else 3277 BUG_ON(ret); 3278 } 3279 mutex_unlock(&fs_info->reclaim_bgs_lock); 3280 3281 if (found_key.offset == 0) 3282 break; 3283 key.offset = found_key.offset - 1; 3284 } 3285 ret = 0; 3286 if (failed && !retried) { 3287 failed = 0; 3288 retried = true; 3289 goto again; 3290 } else if (WARN_ON(failed && retried)) { 3291 ret = -ENOSPC; 3292 } 3293 error: 3294 btrfs_free_path(path); 3295 return ret; 3296 } 3297 3298 /* 3299 * return 1 : allocate a data chunk successfully, 3300 * return <0: errors during allocating a data chunk, 3301 * return 0 : no need to allocate a data chunk. 
3302 */ 3303 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, 3304 u64 chunk_offset) 3305 { 3306 struct btrfs_block_group *cache; 3307 u64 bytes_used; 3308 u64 chunk_type; 3309 3310 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3311 ASSERT(cache); 3312 chunk_type = cache->flags; 3313 btrfs_put_block_group(cache); 3314 3315 if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA)) 3316 return 0; 3317 3318 spin_lock(&fs_info->data_sinfo->lock); 3319 bytes_used = fs_info->data_sinfo->bytes_used; 3320 spin_unlock(&fs_info->data_sinfo->lock); 3321 3322 if (!bytes_used) { 3323 struct btrfs_trans_handle *trans; 3324 int ret; 3325 3326 trans = btrfs_join_transaction(fs_info->tree_root); 3327 if (IS_ERR(trans)) 3328 return PTR_ERR(trans); 3329 3330 ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA); 3331 btrfs_end_transaction(trans); 3332 if (ret < 0) 3333 return ret; 3334 return 1; 3335 } 3336 3337 return 0; 3338 } 3339 3340 static int insert_balance_item(struct btrfs_fs_info *fs_info, 3341 struct btrfs_balance_control *bctl) 3342 { 3343 struct btrfs_root *root = fs_info->tree_root; 3344 struct btrfs_trans_handle *trans; 3345 struct btrfs_balance_item *item; 3346 struct btrfs_disk_balance_args disk_bargs; 3347 struct btrfs_path *path; 3348 struct extent_buffer *leaf; 3349 struct btrfs_key key; 3350 int ret, err; 3351 3352 path = btrfs_alloc_path(); 3353 if (!path) 3354 return -ENOMEM; 3355 3356 trans = btrfs_start_transaction(root, 0); 3357 if (IS_ERR(trans)) { 3358 btrfs_free_path(path); 3359 return PTR_ERR(trans); 3360 } 3361 3362 key.objectid = BTRFS_BALANCE_OBJECTID; 3363 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3364 key.offset = 0; 3365 3366 ret = btrfs_insert_empty_item(trans, root, path, &key, 3367 sizeof(*item)); 3368 if (ret) 3369 goto out; 3370 3371 leaf = path->nodes[0]; 3372 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 3373 3374 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 3375 3376 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); 3377 btrfs_set_balance_data(leaf, item, &disk_bargs); 3378 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); 3379 btrfs_set_balance_meta(leaf, item, &disk_bargs); 3380 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); 3381 btrfs_set_balance_sys(leaf, item, &disk_bargs); 3382 3383 btrfs_set_balance_flags(leaf, item, bctl->flags); 3384 3385 btrfs_mark_buffer_dirty(leaf); 3386 out: 3387 btrfs_free_path(path); 3388 err = btrfs_commit_transaction(trans); 3389 if (err && !ret) 3390 ret = err; 3391 return ret; 3392 } 3393 3394 static int del_balance_item(struct btrfs_fs_info *fs_info) 3395 { 3396 struct btrfs_root *root = fs_info->tree_root; 3397 struct btrfs_trans_handle *trans; 3398 struct btrfs_path *path; 3399 struct btrfs_key key; 3400 int ret, err; 3401 3402 path = btrfs_alloc_path(); 3403 if (!path) 3404 return -ENOMEM; 3405 3406 trans = btrfs_start_transaction_fallback_global_rsv(root, 0); 3407 if (IS_ERR(trans)) { 3408 btrfs_free_path(path); 3409 return PTR_ERR(trans); 3410 } 3411 3412 key.objectid = BTRFS_BALANCE_OBJECTID; 3413 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3414 key.offset = 0; 3415 3416 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3417 if (ret < 0) 3418 goto out; 3419 if (ret > 0) { 3420 ret = -ENOENT; 3421 goto out; 3422 } 3423 3424 ret = btrfs_del_item(trans, root, path); 3425 out: 3426 btrfs_free_path(path); 3427 err = btrfs_commit_transaction(trans); 3428 if (err && !ret) 3429 ret = err; 3430 return ret; 3431 } 3432 3433 /* 3434 * This is a 
heuristic used to reduce the number of chunks balanced on 3435 * resume after balance was interrupted. 3436 */ 3437 static void update_balance_args(struct btrfs_balance_control *bctl) 3438 { 3439 /* 3440 * Turn on soft mode for chunk types that were being converted. 3441 */ 3442 if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) 3443 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; 3444 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) 3445 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; 3446 if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) 3447 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; 3448 3449 /* 3450 * Turn on the usage filter if it is not already in use. The idea is 3451 * that chunks that we have already balanced should be 3452 * reasonably full. Don't do it for chunks that are being 3453 * converted - that will keep us from relocating unconverted 3454 * (albeit full) chunks. 3455 */ 3456 if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && 3457 !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3458 !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3459 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; 3460 bctl->data.usage = 90; 3461 } 3462 if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && 3463 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3464 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3465 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; 3466 bctl->sys.usage = 90; 3467 } 3468 if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && 3469 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3470 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3471 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; 3472 bctl->meta.usage = 90; 3473 } 3474 } 3475 3476 /* 3477 * Clear the balance status in fs_info and delete the balance item from disk. 3478 */ 3479 static void reset_balance_state(struct btrfs_fs_info *fs_info) 3480 { 3481 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3482 int ret; 3483 3484 BUG_ON(!fs_info->balance_ctl); 3485 3486 spin_lock(&fs_info->balance_lock); 3487 fs_info->balance_ctl = NULL; 3488 spin_unlock(&fs_info->balance_lock); 3489 3490 kfree(bctl); 3491 ret = del_balance_item(fs_info); 3492 if (ret) 3493 btrfs_handle_fs_error(fs_info, ret, NULL); 3494 } 3495 3496 /* 3497 * Balance filters. Return 1 if chunk should be filtered out 3498 * (should not be balanced).
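* A zero return from these helpers means the chunk passes that particular filter.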
3499 */ 3500 static int chunk_profiles_filter(u64 chunk_type, 3501 struct btrfs_balance_args *bargs) 3502 { 3503 chunk_type = chunk_to_extended(chunk_type) & 3504 BTRFS_EXTENDED_PROFILE_MASK; 3505 3506 if (bargs->profiles & chunk_type) 3507 return 0; 3508 3509 return 1; 3510 } 3511 3512 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 3513 struct btrfs_balance_args *bargs) 3514 { 3515 struct btrfs_block_group *cache; 3516 u64 chunk_used; 3517 u64 user_thresh_min; 3518 u64 user_thresh_max; 3519 int ret = 1; 3520 3521 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3522 chunk_used = cache->used; 3523 3524 if (bargs->usage_min == 0) 3525 user_thresh_min = 0; 3526 else 3527 user_thresh_min = div_factor_fine(cache->length, 3528 bargs->usage_min); 3529 3530 if (bargs->usage_max == 0) 3531 user_thresh_max = 1; 3532 else if (bargs->usage_max > 100) 3533 user_thresh_max = cache->length; 3534 else 3535 user_thresh_max = div_factor_fine(cache->length, 3536 bargs->usage_max); 3537 3538 if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) 3539 ret = 0; 3540 3541 btrfs_put_block_group(cache); 3542 return ret; 3543 } 3544 3545 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, 3546 u64 chunk_offset, struct btrfs_balance_args *bargs) 3547 { 3548 struct btrfs_block_group *cache; 3549 u64 chunk_used, user_thresh; 3550 int ret = 1; 3551 3552 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3553 chunk_used = cache->used; 3554 3555 if (bargs->usage_min == 0) 3556 user_thresh = 1; 3557 else if (bargs->usage > 100) 3558 user_thresh = cache->length; 3559 else 3560 user_thresh = div_factor_fine(cache->length, bargs->usage); 3561 3562 if (chunk_used < user_thresh) 3563 ret = 0; 3564 3565 btrfs_put_block_group(cache); 3566 return ret; 3567 } 3568 3569 static int chunk_devid_filter(struct extent_buffer *leaf, 3570 struct btrfs_chunk *chunk, 3571 struct btrfs_balance_args *bargs) 3572 { 3573 struct btrfs_stripe *stripe; 3574 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3575 int i; 3576 3577 for (i = 0; i < num_stripes; i++) { 3578 stripe = btrfs_stripe_nr(chunk, i); 3579 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) 3580 return 0; 3581 } 3582 3583 return 1; 3584 } 3585 3586 static u64 calc_data_stripes(u64 type, int num_stripes) 3587 { 3588 const int index = btrfs_bg_flags_to_raid_index(type); 3589 const int ncopies = btrfs_raid_array[index].ncopies; 3590 const int nparity = btrfs_raid_array[index].nparity; 3591 3592 return (num_stripes - nparity) / ncopies; 3593 } 3594 3595 /* [pstart, pend) */ 3596 static int chunk_drange_filter(struct extent_buffer *leaf, 3597 struct btrfs_chunk *chunk, 3598 struct btrfs_balance_args *bargs) 3599 { 3600 struct btrfs_stripe *stripe; 3601 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3602 u64 stripe_offset; 3603 u64 stripe_length; 3604 u64 type; 3605 int factor; 3606 int i; 3607 3608 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) 3609 return 0; 3610 3611 type = btrfs_chunk_type(leaf, chunk); 3612 factor = calc_data_stripes(type, num_stripes); 3613 3614 for (i = 0; i < num_stripes; i++) { 3615 stripe = btrfs_stripe_nr(chunk, i); 3616 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) 3617 continue; 3618 3619 stripe_offset = btrfs_stripe_offset(leaf, stripe); 3620 stripe_length = btrfs_chunk_length(leaf, chunk); 3621 stripe_length = div_u64(stripe_length, factor); 3622 3623 if (stripe_offset < bargs->pend && 3624 stripe_offset + stripe_length > bargs->pstart) 3625 return 0; 
3626 } 3627 3628 return 1; 3629 } 3630 3631 /* [vstart, vend) */ 3632 static int chunk_vrange_filter(struct extent_buffer *leaf, 3633 struct btrfs_chunk *chunk, 3634 u64 chunk_offset, 3635 struct btrfs_balance_args *bargs) 3636 { 3637 if (chunk_offset < bargs->vend && 3638 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) 3639 /* at least part of the chunk is inside this vrange */ 3640 return 0; 3641 3642 return 1; 3643 } 3644 3645 static int chunk_stripes_range_filter(struct extent_buffer *leaf, 3646 struct btrfs_chunk *chunk, 3647 struct btrfs_balance_args *bargs) 3648 { 3649 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3650 3651 if (bargs->stripes_min <= num_stripes 3652 && num_stripes <= bargs->stripes_max) 3653 return 0; 3654 3655 return 1; 3656 } 3657 3658 static int chunk_soft_convert_filter(u64 chunk_type, 3659 struct btrfs_balance_args *bargs) 3660 { 3661 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 3662 return 0; 3663 3664 chunk_type = chunk_to_extended(chunk_type) & 3665 BTRFS_EXTENDED_PROFILE_MASK; 3666 3667 if (bargs->target == chunk_type) 3668 return 1; 3669 3670 return 0; 3671 } 3672 3673 static int should_balance_chunk(struct extent_buffer *leaf, 3674 struct btrfs_chunk *chunk, u64 chunk_offset) 3675 { 3676 struct btrfs_fs_info *fs_info = leaf->fs_info; 3677 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3678 struct btrfs_balance_args *bargs = NULL; 3679 u64 chunk_type = btrfs_chunk_type(leaf, chunk); 3680 3681 /* type filter */ 3682 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & 3683 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { 3684 return 0; 3685 } 3686 3687 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3688 bargs = &bctl->data; 3689 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3690 bargs = &bctl->sys; 3691 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3692 bargs = &bctl->meta; 3693 3694 /* profiles filter */ 3695 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && 3696 chunk_profiles_filter(chunk_type, bargs)) { 3697 return 0; 3698 } 3699 3700 /* usage filter */ 3701 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && 3702 chunk_usage_filter(fs_info, chunk_offset, bargs)) { 3703 return 0; 3704 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3705 chunk_usage_range_filter(fs_info, chunk_offset, bargs)) { 3706 return 0; 3707 } 3708 3709 /* devid filter */ 3710 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && 3711 chunk_devid_filter(leaf, chunk, bargs)) { 3712 return 0; 3713 } 3714 3715 /* drange filter, makes sense only with devid filter */ 3716 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && 3717 chunk_drange_filter(leaf, chunk, bargs)) { 3718 return 0; 3719 } 3720 3721 /* vrange filter */ 3722 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && 3723 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { 3724 return 0; 3725 } 3726 3727 /* stripes filter */ 3728 if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) && 3729 chunk_stripes_range_filter(leaf, chunk, bargs)) { 3730 return 0; 3731 } 3732 3733 /* soft profile changing mode */ 3734 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && 3735 chunk_soft_convert_filter(chunk_type, bargs)) { 3736 return 0; 3737 } 3738 3739 /* 3740 * limited by count, must be the last filter 3741 */ 3742 if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { 3743 if (bargs->limit == 0) 3744 return 0; 3745 else 3746 bargs->limit--; 3747 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { 3748 /* 3749 * Same logic as the 'limit' filter; the minimum cannot be 3750 * determined here 
because we do not have the global information 3751 about the count of all chunks that satisfy the filters. 3752 */ 3753 if (bargs->limit_max == 0) 3754 return 0; 3755 else 3756 bargs->limit_max--; 3757 } 3758 3759 return 1; 3760 } 3761 3762 static int __btrfs_balance(struct btrfs_fs_info *fs_info) 3763 { 3764 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3765 struct btrfs_root *chunk_root = fs_info->chunk_root; 3766 u64 chunk_type; 3767 struct btrfs_chunk *chunk; 3768 struct btrfs_path *path = NULL; 3769 struct btrfs_key key; 3770 struct btrfs_key found_key; 3771 struct extent_buffer *leaf; 3772 int slot; 3773 int ret; 3774 int enospc_errors = 0; 3775 bool counting = true; 3776 /* The single value limit and min/max limits use the same bytes in the balance args (they share a union), so saving ->limit preserves both */ 3777 u64 limit_data = bctl->data.limit; 3778 u64 limit_meta = bctl->meta.limit; 3779 u64 limit_sys = bctl->sys.limit; 3780 u32 count_data = 0; 3781 u32 count_meta = 0; 3782 u32 count_sys = 0; 3783 int chunk_reserved = 0; 3784 3785 path = btrfs_alloc_path(); 3786 if (!path) { 3787 ret = -ENOMEM; 3788 goto error; 3789 } 3790 3791 /* zero out stat counters */ 3792 spin_lock(&fs_info->balance_lock); 3793 memset(&bctl->stat, 0, sizeof(bctl->stat)); 3794 spin_unlock(&fs_info->balance_lock); 3795 again: 3796 if (!counting) { 3797 /* 3798 * The single value limit and min/max limits use the same bytes 3799 * in the balance args, so restore the saved values before the 3800 * real pass (the counting pass above consumed them). */ 3801 bctl->data.limit = limit_data; 3802 bctl->meta.limit = limit_meta; 3803 bctl->sys.limit = limit_sys; 3804 } 3805 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 3806 key.offset = (u64)-1; 3807 key.type = BTRFS_CHUNK_ITEM_KEY; 3808 3809 while (1) { 3810 if ((!counting && atomic_read(&fs_info->balance_pause_req)) || 3811 atomic_read(&fs_info->balance_cancel_req)) { 3812 ret = -ECANCELED; 3813 goto error; 3814 } 3815 3816 mutex_lock(&fs_info->reclaim_bgs_lock); 3817 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 3818 if (ret < 0) { 3819 mutex_unlock(&fs_info->reclaim_bgs_lock); 3820 goto error; 3821 } 3822 3823 /* 3824 * this shouldn't happen, it means the last relocate 3825 * failed 3826 */ 3827 if (ret == 0) 3828 BUG(); /* FIXME break ?
*/ 3829 3830 ret = btrfs_previous_item(chunk_root, path, 0, 3831 BTRFS_CHUNK_ITEM_KEY); 3832 if (ret) { 3833 mutex_unlock(&fs_info->reclaim_bgs_lock); 3834 ret = 0; 3835 break; 3836 } 3837 3838 leaf = path->nodes[0]; 3839 slot = path->slots[0]; 3840 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3841 3842 if (found_key.objectid != key.objectid) { 3843 mutex_unlock(&fs_info->reclaim_bgs_lock); 3844 break; 3845 } 3846 3847 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 3848 chunk_type = btrfs_chunk_type(leaf, chunk); 3849 3850 if (!counting) { 3851 spin_lock(&fs_info->balance_lock); 3852 bctl->stat.considered++; 3853 spin_unlock(&fs_info->balance_lock); 3854 } 3855 3856 ret = should_balance_chunk(leaf, chunk, found_key.offset); 3857 3858 btrfs_release_path(path); 3859 if (!ret) { 3860 mutex_unlock(&fs_info->reclaim_bgs_lock); 3861 goto loop; 3862 } 3863 3864 if (counting) { 3865 mutex_unlock(&fs_info->reclaim_bgs_lock); 3866 spin_lock(&fs_info->balance_lock); 3867 bctl->stat.expected++; 3868 spin_unlock(&fs_info->balance_lock); 3869 3870 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3871 count_data++; 3872 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3873 count_sys++; 3874 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3875 count_meta++; 3876 3877 goto loop; 3878 } 3879 3880 /* 3881 * Apply limit_min filter, no need to check if the LIMITS 3882 * filter is used, limit_min is 0 by default 3883 */ 3884 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) && 3885 count_data < bctl->data.limit_min) 3886 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) && 3887 count_meta < bctl->meta.limit_min) 3888 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) && 3889 count_sys < bctl->sys.limit_min)) { 3890 mutex_unlock(&fs_info->reclaim_bgs_lock); 3891 goto loop; 3892 } 3893 3894 if (!chunk_reserved) { 3895 /* 3896 * We may be relocating the only data chunk we have, 3897 * which could potentially end up with losing data's 3898 * raid profile, so lets allocate an empty one in 3899 * advance. 3900 */ 3901 ret = btrfs_may_alloc_data_chunk(fs_info, 3902 found_key.offset); 3903 if (ret < 0) { 3904 mutex_unlock(&fs_info->reclaim_bgs_lock); 3905 goto error; 3906 } else if (ret == 1) { 3907 chunk_reserved = 1; 3908 } 3909 } 3910 3911 ret = btrfs_relocate_chunk(fs_info, found_key.offset); 3912 mutex_unlock(&fs_info->reclaim_bgs_lock); 3913 if (ret == -ENOSPC) { 3914 enospc_errors++; 3915 } else if (ret == -ETXTBSY) { 3916 btrfs_info(fs_info, 3917 "skipping relocation of block group %llu due to active swapfile", 3918 found_key.offset); 3919 ret = 0; 3920 } else if (ret) { 3921 goto error; 3922 } else { 3923 spin_lock(&fs_info->balance_lock); 3924 bctl->stat.completed++; 3925 spin_unlock(&fs_info->balance_lock); 3926 } 3927 loop: 3928 if (found_key.offset == 0) 3929 break; 3930 key.offset = found_key.offset - 1; 3931 } 3932 3933 if (counting) { 3934 btrfs_release_path(path); 3935 counting = false; 3936 goto again; 3937 } 3938 error: 3939 btrfs_free_path(path); 3940 if (enospc_errors) { 3941 btrfs_info(fs_info, "%d enospc errors during balance", 3942 enospc_errors); 3943 if (!ret) 3944 ret = -ENOSPC; 3945 } 3946 3947 return ret; 3948 } 3949 3950 /** 3951 * alloc_profile_is_valid - see if a given profile is valid and reduced 3952 * @flags: profile to validate 3953 * @extended: if true @flags is treated as an extended profile 3954 */ 3955 static int alloc_profile_is_valid(u64 flags, int extended) 3956 { 3957 u64 mask = (extended ? 
BTRFS_EXTENDED_PROFILE_MASK : 3958 BTRFS_BLOCK_GROUP_PROFILE_MASK); 3959 3960 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; 3961 3962 /* 1) check that all other bits are zeroed */ 3963 if (flags & ~mask) 3964 return 0; 3965 3966 /* 2) see if profile is reduced */ 3967 if (flags == 0) 3968 return !extended; /* "0" is valid for usual profiles */ 3969 3970 return has_single_bit_set(flags); 3971 } 3972 3973 static inline int balance_need_close(struct btrfs_fs_info *fs_info) 3974 { 3975 /* cancel requested || normal exit path */ 3976 return atomic_read(&fs_info->balance_cancel_req) || 3977 (atomic_read(&fs_info->balance_pause_req) == 0 && 3978 atomic_read(&fs_info->balance_cancel_req) == 0); 3979 } 3980 3981 /* 3982 * Validate target profile against allowed profiles and return true if it's OK. 3983 * Otherwise print the error message and return false. 3984 */ 3985 static inline int validate_convert_profile(struct btrfs_fs_info *fs_info, 3986 const struct btrfs_balance_args *bargs, 3987 u64 allowed, const char *type) 3988 { 3989 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 3990 return true; 3991 3992 if (fs_info->sectorsize < PAGE_SIZE && 3993 bargs->target & BTRFS_BLOCK_GROUP_RAID56_MASK) { 3994 btrfs_err(fs_info, 3995 "RAID56 is not yet supported for sectorsize %u with page size %lu", 3996 fs_info->sectorsize, PAGE_SIZE); 3997 return false; 3998 } 3999 /* Profile is valid and does not have bits outside of the allowed set */ 4000 if (alloc_profile_is_valid(bargs->target, 1) && 4001 (bargs->target & ~allowed) == 0) 4002 return true; 4003 4004 btrfs_err(fs_info, "balance: invalid convert %s profile %s", 4005 type, btrfs_bg_type_to_raid_name(bargs->target)); 4006 return false; 4007 } 4008 4009 /* 4010 * Fill @buf with textual description of balance filter flags @bargs, up to 4011 * @size_buf including the terminating null. The output may be trimmed if it 4012 * does not fit into the provided buffer. 
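 * For example, a filter using the usage and limit ranges renders as
 * "usage=20..80,limit=1..10" (values illustrative); the trailing comma
 * left by the last printed filter is stripped at out_overflow.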
4013 */ 4014 static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf, 4015 u32 size_buf) 4016 { 4017 int ret; 4018 u32 size_bp = size_buf; 4019 char *bp = buf; 4020 u64 flags = bargs->flags; 4021 char tmp_buf[128] = {'\0'}; 4022 4023 if (!flags) 4024 return; 4025 4026 #define CHECK_APPEND_NOARG(a) \ 4027 do { \ 4028 ret = snprintf(bp, size_bp, (a)); \ 4029 if (ret < 0 || ret >= size_bp) \ 4030 goto out_overflow; \ 4031 size_bp -= ret; \ 4032 bp += ret; \ 4033 } while (0) 4034 4035 #define CHECK_APPEND_1ARG(a, v1) \ 4036 do { \ 4037 ret = snprintf(bp, size_bp, (a), (v1)); \ 4038 if (ret < 0 || ret >= size_bp) \ 4039 goto out_overflow; \ 4040 size_bp -= ret; \ 4041 bp += ret; \ 4042 } while (0) 4043 4044 #define CHECK_APPEND_2ARG(a, v1, v2) \ 4045 do { \ 4046 ret = snprintf(bp, size_bp, (a), (v1), (v2)); \ 4047 if (ret < 0 || ret >= size_bp) \ 4048 goto out_overflow; \ 4049 size_bp -= ret; \ 4050 bp += ret; \ 4051 } while (0) 4052 4053 if (flags & BTRFS_BALANCE_ARGS_CONVERT) 4054 CHECK_APPEND_1ARG("convert=%s,", 4055 btrfs_bg_type_to_raid_name(bargs->target)); 4056 4057 if (flags & BTRFS_BALANCE_ARGS_SOFT) 4058 CHECK_APPEND_NOARG("soft,"); 4059 4060 if (flags & BTRFS_BALANCE_ARGS_PROFILES) { 4061 btrfs_describe_block_groups(bargs->profiles, tmp_buf, 4062 sizeof(tmp_buf)); 4063 CHECK_APPEND_1ARG("profiles=%s,", tmp_buf); 4064 } 4065 4066 if (flags & BTRFS_BALANCE_ARGS_USAGE) 4067 CHECK_APPEND_1ARG("usage=%llu,", bargs->usage); 4068 4069 if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) 4070 CHECK_APPEND_2ARG("usage=%u..%u,", 4071 bargs->usage_min, bargs->usage_max); 4072 4073 if (flags & BTRFS_BALANCE_ARGS_DEVID) 4074 CHECK_APPEND_1ARG("devid=%llu,", bargs->devid); 4075 4076 if (flags & BTRFS_BALANCE_ARGS_DRANGE) 4077 CHECK_APPEND_2ARG("drange=%llu..%llu,", 4078 bargs->pstart, bargs->pend); 4079 4080 if (flags & BTRFS_BALANCE_ARGS_VRANGE) 4081 CHECK_APPEND_2ARG("vrange=%llu..%llu,", 4082 bargs->vstart, bargs->vend); 4083 4084 if (flags & BTRFS_BALANCE_ARGS_LIMIT) 4085 CHECK_APPEND_1ARG("limit=%llu,", bargs->limit); 4086 4087 if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE) 4088 CHECK_APPEND_2ARG("limit=%u..%u,", 4089 bargs->limit_min, bargs->limit_max); 4090 4091 if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) 4092 CHECK_APPEND_2ARG("stripes=%u..%u,", 4093 bargs->stripes_min, bargs->stripes_max); 4094 4095 #undef CHECK_APPEND_2ARG 4096 #undef CHECK_APPEND_1ARG 4097 #undef CHECK_APPEND_NOARG 4098 4099 out_overflow: 4100 4101 if (size_bp < size_buf) 4102 buf[size_buf - size_bp - 1] = '\0'; /* remove last , */ 4103 else 4104 buf[0] = '\0'; 4105 } 4106 4107 static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info) 4108 { 4109 u32 size_buf = 1024; 4110 char tmp_buf[192] = {'\0'}; 4111 char *buf; 4112 char *bp; 4113 u32 size_bp = size_buf; 4114 int ret; 4115 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 4116 4117 buf = kzalloc(size_buf, GFP_KERNEL); 4118 if (!buf) 4119 return; 4120 4121 bp = buf; 4122 4123 #define CHECK_APPEND_1ARG(a, v1) \ 4124 do { \ 4125 ret = snprintf(bp, size_bp, (a), (v1)); \ 4126 if (ret < 0 || ret >= size_bp) \ 4127 goto out_overflow; \ 4128 size_bp -= ret; \ 4129 bp += ret; \ 4130 } while (0) 4131 4132 if (bctl->flags & BTRFS_BALANCE_FORCE) 4133 CHECK_APPEND_1ARG("%s", "-f "); 4134 4135 if (bctl->flags & BTRFS_BALANCE_DATA) { 4136 describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf)); 4137 CHECK_APPEND_1ARG("-d%s ", tmp_buf); 4138 } 4139 4140 if (bctl->flags & BTRFS_BALANCE_METADATA) { 4141 describe_balance_args(&bctl->meta, 
tmp_buf, sizeof(tmp_buf)); 4142 CHECK_APPEND_1ARG("-m%s ", tmp_buf); 4143 } 4144 4145 if (bctl->flags & BTRFS_BALANCE_SYSTEM) { 4146 describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf)); 4147 CHECK_APPEND_1ARG("-s%s ", tmp_buf); 4148 } 4149 4150 #undef CHECK_APPEND_1ARG 4151 4152 out_overflow: 4153 4154 if (size_bp < size_buf) 4155 buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */ 4156 btrfs_info(fs_info, "balance: %s %s", 4157 (bctl->flags & BTRFS_BALANCE_RESUME) ? 4158 "resume" : "start", buf); 4159 4160 kfree(buf); 4161 } 4162 4163 /* 4164 * Should be called with balance mutexe held 4165 */ 4166 int btrfs_balance(struct btrfs_fs_info *fs_info, 4167 struct btrfs_balance_control *bctl, 4168 struct btrfs_ioctl_balance_args *bargs) 4169 { 4170 u64 meta_target, data_target; 4171 u64 allowed; 4172 int mixed = 0; 4173 int ret; 4174 u64 num_devices; 4175 unsigned seq; 4176 bool reducing_redundancy; 4177 int i; 4178 4179 if (btrfs_fs_closing(fs_info) || 4180 atomic_read(&fs_info->balance_pause_req) || 4181 btrfs_should_cancel_balance(fs_info)) { 4182 ret = -EINVAL; 4183 goto out; 4184 } 4185 4186 allowed = btrfs_super_incompat_flags(fs_info->super_copy); 4187 if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 4188 mixed = 1; 4189 4190 /* 4191 * In case of mixed groups both data and meta should be picked, 4192 * and identical options should be given for both of them. 4193 */ 4194 allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA; 4195 if (mixed && (bctl->flags & allowed)) { 4196 if (!(bctl->flags & BTRFS_BALANCE_DATA) || 4197 !(bctl->flags & BTRFS_BALANCE_METADATA) || 4198 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { 4199 btrfs_err(fs_info, 4200 "balance: mixed groups data and metadata options must be the same"); 4201 ret = -EINVAL; 4202 goto out; 4203 } 4204 } 4205 4206 /* 4207 * rw_devices will not change at the moment, device add/delete/replace 4208 * are exclusive 4209 */ 4210 num_devices = fs_info->fs_devices->rw_devices; 4211 4212 /* 4213 * SINGLE profile on-disk has no profile bit, but in-memory we have a 4214 * special bit for it, to make it easier to distinguish. Thus we need 4215 * to set it manually, or balance would refuse the profile. 
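 * (BTRFS_AVAIL_ALLOC_BIT_SINGLE is that in-memory bit; below it is OR-ed
 * into the allowed mask together with every profile whose devs_min is
 * satisfied by the current number of rw devices.)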
4216 */ 4217 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 4218 for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) 4219 if (num_devices >= btrfs_raid_array[i].devs_min) 4220 allowed |= btrfs_raid_array[i].bg_flag; 4221 4222 if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") || 4223 !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") || 4224 !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) { 4225 ret = -EINVAL; 4226 goto out; 4227 } 4228 4229 /* 4230 * Allow to reduce metadata or system integrity only if force set for 4231 * profiles with redundancy (copies, parity) 4232 */ 4233 allowed = 0; 4234 for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) { 4235 if (btrfs_raid_array[i].ncopies >= 2 || 4236 btrfs_raid_array[i].tolerated_failures >= 1) 4237 allowed |= btrfs_raid_array[i].bg_flag; 4238 } 4239 do { 4240 seq = read_seqbegin(&fs_info->profiles_lock); 4241 4242 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 4243 (fs_info->avail_system_alloc_bits & allowed) && 4244 !(bctl->sys.target & allowed)) || 4245 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 4246 (fs_info->avail_metadata_alloc_bits & allowed) && 4247 !(bctl->meta.target & allowed))) 4248 reducing_redundancy = true; 4249 else 4250 reducing_redundancy = false; 4251 4252 /* if we're not converting, the target field is uninitialized */ 4253 meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 4254 bctl->meta.target : fs_info->avail_metadata_alloc_bits; 4255 data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 4256 bctl->data.target : fs_info->avail_data_alloc_bits; 4257 } while (read_seqretry(&fs_info->profiles_lock, seq)); 4258 4259 if (reducing_redundancy) { 4260 if (bctl->flags & BTRFS_BALANCE_FORCE) { 4261 btrfs_info(fs_info, 4262 "balance: force reducing metadata redundancy"); 4263 } else { 4264 btrfs_err(fs_info, 4265 "balance: reduces metadata redundancy, use --force if you want this"); 4266 ret = -EINVAL; 4267 goto out; 4268 } 4269 } 4270 4271 if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < 4272 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { 4273 btrfs_warn(fs_info, 4274 "balance: metadata profile %s has lower redundancy than data profile %s", 4275 btrfs_bg_type_to_raid_name(meta_target), 4276 btrfs_bg_type_to_raid_name(data_target)); 4277 } 4278 4279 ret = insert_balance_item(fs_info, bctl); 4280 if (ret && ret != -EEXIST) 4281 goto out; 4282 4283 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { 4284 BUG_ON(ret == -EEXIST); 4285 BUG_ON(fs_info->balance_ctl); 4286 spin_lock(&fs_info->balance_lock); 4287 fs_info->balance_ctl = bctl; 4288 spin_unlock(&fs_info->balance_lock); 4289 } else { 4290 BUG_ON(ret != -EEXIST); 4291 spin_lock(&fs_info->balance_lock); 4292 update_balance_args(bctl); 4293 spin_unlock(&fs_info->balance_lock); 4294 } 4295 4296 ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4297 set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); 4298 describe_balance_start_or_resume(fs_info); 4299 mutex_unlock(&fs_info->balance_mutex); 4300 4301 ret = __btrfs_balance(fs_info); 4302 4303 mutex_lock(&fs_info->balance_mutex); 4304 if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) 4305 btrfs_info(fs_info, "balance: paused"); 4306 /* 4307 * Balance can be canceled by: 4308 * 4309 * - Regular cancel request 4310 * Then ret == -ECANCELED and balance_cancel_req > 0 4311 * 4312 * - Fatal signal to "btrfs" process 4313 * Either the signal caught by wait_reserve_ticket() and callers 4314 * got 
-EINTR, or caught by btrfs_should_cancel_balance() and 4315 * got -ECANCELED. 4316 * Either way, in this case balance_cancel_req = 0, and 4317 * ret == -EINTR or ret == -ECANCELED. 4318 * 4319 * So here we only check the return value to catch canceled balance. 4320 */ 4321 else if (ret == -ECANCELED || ret == -EINTR) 4322 btrfs_info(fs_info, "balance: canceled"); 4323 else 4324 btrfs_info(fs_info, "balance: ended with status: %d", ret); 4325 4326 clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); 4327 4328 if (bargs) { 4329 memset(bargs, 0, sizeof(*bargs)); 4330 btrfs_update_ioctl_balance_args(fs_info, bargs); 4331 } 4332 4333 if ((ret && ret != -ECANCELED && ret != -ENOSPC) || 4334 balance_need_close(fs_info)) { 4335 reset_balance_state(fs_info); 4336 btrfs_exclop_finish(fs_info); 4337 } 4338 4339 wake_up(&fs_info->balance_wait_q); 4340 4341 return ret; 4342 out: 4343 if (bctl->flags & BTRFS_BALANCE_RESUME) 4344 reset_balance_state(fs_info); 4345 else 4346 kfree(bctl); 4347 btrfs_exclop_finish(fs_info); 4348 4349 return ret; 4350 } 4351 4352 static int balance_kthread(void *data) 4353 { 4354 struct btrfs_fs_info *fs_info = data; 4355 int ret = 0; 4356 4357 mutex_lock(&fs_info->balance_mutex); 4358 if (fs_info->balance_ctl) 4359 ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); 4360 mutex_unlock(&fs_info->balance_mutex); 4361 4362 return ret; 4363 } 4364 4365 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) 4366 { 4367 struct task_struct *tsk; 4368 4369 mutex_lock(&fs_info->balance_mutex); 4370 if (!fs_info->balance_ctl) { 4371 mutex_unlock(&fs_info->balance_mutex); 4372 return 0; 4373 } 4374 mutex_unlock(&fs_info->balance_mutex); 4375 4376 if (btrfs_test_opt(fs_info, SKIP_BALANCE)) { 4377 btrfs_info(fs_info, "balance: resume skipped"); 4378 return 0; 4379 } 4380 4381 /* 4382 * A ro->rw remount sequence should continue with the paused balance 4383 * regardless of who pauses it, system or the user as of now, so set 4384 * the resume flag. 
4385 */ 4386 spin_lock(&fs_info->balance_lock); 4387 fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME; 4388 spin_unlock(&fs_info->balance_lock); 4389 4390 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 4391 return PTR_ERR_OR_ZERO(tsk); 4392 } 4393 4394 int btrfs_recover_balance(struct btrfs_fs_info *fs_info) 4395 { 4396 struct btrfs_balance_control *bctl; 4397 struct btrfs_balance_item *item; 4398 struct btrfs_disk_balance_args disk_bargs; 4399 struct btrfs_path *path; 4400 struct extent_buffer *leaf; 4401 struct btrfs_key key; 4402 int ret; 4403 4404 path = btrfs_alloc_path(); 4405 if (!path) 4406 return -ENOMEM; 4407 4408 key.objectid = BTRFS_BALANCE_OBJECTID; 4409 key.type = BTRFS_TEMPORARY_ITEM_KEY; 4410 key.offset = 0; 4411 4412 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 4413 if (ret < 0) 4414 goto out; 4415 if (ret > 0) { /* ret = -ENOENT; */ 4416 ret = 0; 4417 goto out; 4418 } 4419 4420 bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 4421 if (!bctl) { 4422 ret = -ENOMEM; 4423 goto out; 4424 } 4425 4426 leaf = path->nodes[0]; 4427 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 4428 4429 bctl->flags = btrfs_balance_flags(leaf, item); 4430 bctl->flags |= BTRFS_BALANCE_RESUME; 4431 4432 btrfs_balance_data(leaf, item, &disk_bargs); 4433 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); 4434 btrfs_balance_meta(leaf, item, &disk_bargs); 4435 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); 4436 btrfs_balance_sys(leaf, item, &disk_bargs); 4437 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); 4438 4439 /* 4440 * This should never happen, as the paused balance state is recovered 4441 * during mount without any chance of other exclusive ops to collide. 4442 * 4443 * This gives the exclusive op status to balance and keeps in paused 4444 * state until user intervention (cancel or umount). If the ownership 4445 * cannot be assigned, show a message but do not fail. The balance 4446 * is in a paused state and must have fs_info::balance_ctl properly 4447 * set up. 
4448 */ 4449 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) 4450 btrfs_warn(fs_info, 4451 "balance: cannot set exclusive op status, resume manually"); 4452 4453 btrfs_release_path(path); 4454 4455 mutex_lock(&fs_info->balance_mutex); 4456 BUG_ON(fs_info->balance_ctl); 4457 spin_lock(&fs_info->balance_lock); 4458 fs_info->balance_ctl = bctl; 4459 spin_unlock(&fs_info->balance_lock); 4460 mutex_unlock(&fs_info->balance_mutex); 4461 out: 4462 btrfs_free_path(path); 4463 return ret; 4464 } 4465 4466 int btrfs_pause_balance(struct btrfs_fs_info *fs_info) 4467 { 4468 int ret = 0; 4469 4470 mutex_lock(&fs_info->balance_mutex); 4471 if (!fs_info->balance_ctl) { 4472 mutex_unlock(&fs_info->balance_mutex); 4473 return -ENOTCONN; 4474 } 4475 4476 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 4477 atomic_inc(&fs_info->balance_pause_req); 4478 mutex_unlock(&fs_info->balance_mutex); 4479 4480 wait_event(fs_info->balance_wait_q, 4481 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4482 4483 mutex_lock(&fs_info->balance_mutex); 4484 /* we are good with balance_ctl ripped off from under us */ 4485 BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4486 atomic_dec(&fs_info->balance_pause_req); 4487 } else { 4488 ret = -ENOTCONN; 4489 } 4490 4491 mutex_unlock(&fs_info->balance_mutex); 4492 return ret; 4493 } 4494 4495 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) 4496 { 4497 mutex_lock(&fs_info->balance_mutex); 4498 if (!fs_info->balance_ctl) { 4499 mutex_unlock(&fs_info->balance_mutex); 4500 return -ENOTCONN; 4501 } 4502 4503 /* 4504 * A paused balance with the item stored on disk can be resumed at 4505 * mount time if the mount is read-write. Otherwise it's still paused 4506 * and we must not allow cancelling as it deletes the item. 4507 */ 4508 if (sb_rdonly(fs_info->sb)) { 4509 mutex_unlock(&fs_info->balance_mutex); 4510 return -EROFS; 4511 } 4512 4513 atomic_inc(&fs_info->balance_cancel_req); 4514 /* 4515 * if we are running just wait and return, balance item is 4516 * deleted in btrfs_balance in this case 4517 */ 4518 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 4519 mutex_unlock(&fs_info->balance_mutex); 4520 wait_event(fs_info->balance_wait_q, 4521 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4522 mutex_lock(&fs_info->balance_mutex); 4523 } else { 4524 mutex_unlock(&fs_info->balance_mutex); 4525 /* 4526 * Lock released to allow other waiters to continue, we'll 4527 * reexamine the status again. 
4528 */ 4529 mutex_lock(&fs_info->balance_mutex); 4530 4531 if (fs_info->balance_ctl) { 4532 reset_balance_state(fs_info); 4533 btrfs_exclop_finish(fs_info); 4534 btrfs_info(fs_info, "balance: canceled"); 4535 } 4536 } 4537 4538 BUG_ON(fs_info->balance_ctl || 4539 test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4540 atomic_dec(&fs_info->balance_cancel_req); 4541 mutex_unlock(&fs_info->balance_mutex); 4542 return 0; 4543 } 4544 4545 int btrfs_uuid_scan_kthread(void *data) 4546 { 4547 struct btrfs_fs_info *fs_info = data; 4548 struct btrfs_root *root = fs_info->tree_root; 4549 struct btrfs_key key; 4550 struct btrfs_path *path = NULL; 4551 int ret = 0; 4552 struct extent_buffer *eb; 4553 int slot; 4554 struct btrfs_root_item root_item; 4555 u32 item_size; 4556 struct btrfs_trans_handle *trans = NULL; 4557 bool closing = false; 4558 4559 path = btrfs_alloc_path(); 4560 if (!path) { 4561 ret = -ENOMEM; 4562 goto out; 4563 } 4564 4565 key.objectid = 0; 4566 key.type = BTRFS_ROOT_ITEM_KEY; 4567 key.offset = 0; 4568 4569 while (1) { 4570 if (btrfs_fs_closing(fs_info)) { 4571 closing = true; 4572 break; 4573 } 4574 ret = btrfs_search_forward(root, &key, path, 4575 BTRFS_OLDEST_GENERATION); 4576 if (ret) { 4577 if (ret > 0) 4578 ret = 0; 4579 break; 4580 } 4581 4582 if (key.type != BTRFS_ROOT_ITEM_KEY || 4583 (key.objectid < BTRFS_FIRST_FREE_OBJECTID && 4584 key.objectid != BTRFS_FS_TREE_OBJECTID) || 4585 key.objectid > BTRFS_LAST_FREE_OBJECTID) 4586 goto skip; 4587 4588 eb = path->nodes[0]; 4589 slot = path->slots[0]; 4590 item_size = btrfs_item_size_nr(eb, slot); 4591 if (item_size < sizeof(root_item)) 4592 goto skip; 4593 4594 read_extent_buffer(eb, &root_item, 4595 btrfs_item_ptr_offset(eb, slot), 4596 (int)sizeof(root_item)); 4597 if (btrfs_root_refs(&root_item) == 0) 4598 goto skip; 4599 4600 if (!btrfs_is_empty_uuid(root_item.uuid) || 4601 !btrfs_is_empty_uuid(root_item.received_uuid)) { 4602 if (trans) 4603 goto update_tree; 4604 4605 btrfs_release_path(path); 4606 /* 4607 * 1 - subvol uuid item 4608 * 1 - received_subvol uuid item 4609 */ 4610 trans = btrfs_start_transaction(fs_info->uuid_root, 2); 4611 if (IS_ERR(trans)) { 4612 ret = PTR_ERR(trans); 4613 break; 4614 } 4615 continue; 4616 } else { 4617 goto skip; 4618 } 4619 update_tree: 4620 btrfs_release_path(path); 4621 if (!btrfs_is_empty_uuid(root_item.uuid)) { 4622 ret = btrfs_uuid_tree_add(trans, root_item.uuid, 4623 BTRFS_UUID_KEY_SUBVOL, 4624 key.objectid); 4625 if (ret < 0) { 4626 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4627 ret); 4628 break; 4629 } 4630 } 4631 4632 if (!btrfs_is_empty_uuid(root_item.received_uuid)) { 4633 ret = btrfs_uuid_tree_add(trans, 4634 root_item.received_uuid, 4635 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 4636 key.objectid); 4637 if (ret < 0) { 4638 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4639 ret); 4640 break; 4641 } 4642 } 4643 4644 skip: 4645 btrfs_release_path(path); 4646 if (trans) { 4647 ret = btrfs_end_transaction(trans); 4648 trans = NULL; 4649 if (ret) 4650 break; 4651 } 4652 4653 if (key.offset < (u64)-1) { 4654 key.offset++; 4655 } else if (key.type < BTRFS_ROOT_ITEM_KEY) { 4656 key.offset = 0; 4657 key.type = BTRFS_ROOT_ITEM_KEY; 4658 } else if (key.objectid < (u64)-1) { 4659 key.offset = 0; 4660 key.type = BTRFS_ROOT_ITEM_KEY; 4661 key.objectid++; 4662 } else { 4663 break; 4664 } 4665 cond_resched(); 4666 } 4667 4668 out: 4669 btrfs_free_path(path); 4670 if (trans && !IS_ERR(trans)) 4671 btrfs_end_transaction(trans); 4672 if (ret) 4673 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread 
failed %d", ret); 4674 else if (!closing) 4675 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); 4676 up(&fs_info->uuid_tree_rescan_sem); 4677 return 0; 4678 } 4679 4680 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) 4681 { 4682 struct btrfs_trans_handle *trans; 4683 struct btrfs_root *tree_root = fs_info->tree_root; 4684 struct btrfs_root *uuid_root; 4685 struct task_struct *task; 4686 int ret; 4687 4688 /* 4689 * 1 - root node 4690 * 1 - root item 4691 */ 4692 trans = btrfs_start_transaction(tree_root, 2); 4693 if (IS_ERR(trans)) 4694 return PTR_ERR(trans); 4695 4696 uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID); 4697 if (IS_ERR(uuid_root)) { 4698 ret = PTR_ERR(uuid_root); 4699 btrfs_abort_transaction(trans, ret); 4700 btrfs_end_transaction(trans); 4701 return ret; 4702 } 4703 4704 fs_info->uuid_root = uuid_root; 4705 4706 ret = btrfs_commit_transaction(trans); 4707 if (ret) 4708 return ret; 4709 4710 down(&fs_info->uuid_tree_rescan_sem); 4711 task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); 4712 if (IS_ERR(task)) { 4713 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 4714 btrfs_warn(fs_info, "failed to start uuid_scan task"); 4715 up(&fs_info->uuid_tree_rescan_sem); 4716 return PTR_ERR(task); 4717 } 4718 4719 return 0; 4720 } 4721 4722 /* 4723 * shrinking a device means finding all of the device extents past 4724 * the new size, and then following the back refs to the chunks. 4725 * The chunk relocation code actually frees the device extent 4726 */ 4727 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 4728 { 4729 struct btrfs_fs_info *fs_info = device->fs_info; 4730 struct btrfs_root *root = fs_info->dev_root; 4731 struct btrfs_trans_handle *trans; 4732 struct btrfs_dev_extent *dev_extent = NULL; 4733 struct btrfs_path *path; 4734 u64 length; 4735 u64 chunk_offset; 4736 int ret; 4737 int slot; 4738 int failed = 0; 4739 bool retried = false; 4740 struct extent_buffer *l; 4741 struct btrfs_key key; 4742 struct btrfs_super_block *super_copy = fs_info->super_copy; 4743 u64 old_total = btrfs_super_total_bytes(super_copy); 4744 u64 old_size = btrfs_device_get_total_bytes(device); 4745 u64 diff; 4746 u64 start; 4747 4748 new_size = round_down(new_size, fs_info->sectorsize); 4749 start = new_size; 4750 diff = round_down(old_size - new_size, fs_info->sectorsize); 4751 4752 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 4753 return -EINVAL; 4754 4755 path = btrfs_alloc_path(); 4756 if (!path) 4757 return -ENOMEM; 4758 4759 path->reada = READA_BACK; 4760 4761 trans = btrfs_start_transaction(root, 0); 4762 if (IS_ERR(trans)) { 4763 btrfs_free_path(path); 4764 return PTR_ERR(trans); 4765 } 4766 4767 mutex_lock(&fs_info->chunk_mutex); 4768 4769 btrfs_device_set_total_bytes(device, new_size); 4770 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 4771 device->fs_devices->total_rw_bytes -= diff; 4772 atomic64_sub(diff, &fs_info->free_chunk_space); 4773 } 4774 4775 /* 4776 * Once the device's size has been set to the new size, ensure all 4777 * in-memory chunks are synced to disk so that the loop below sees them 4778 * and relocates them accordingly. 
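 * (If pending chunk allocations exist past the new size we commit the
 * transaction to flush them out; otherwise it is simply ended.)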
4779 */ 4780 if (contains_pending_extent(device, &start, diff)) { 4781 mutex_unlock(&fs_info->chunk_mutex); 4782 ret = btrfs_commit_transaction(trans); 4783 if (ret) 4784 goto done; 4785 } else { 4786 mutex_unlock(&fs_info->chunk_mutex); 4787 btrfs_end_transaction(trans); 4788 } 4789 4790 again: 4791 key.objectid = device->devid; 4792 key.offset = (u64)-1; 4793 key.type = BTRFS_DEV_EXTENT_KEY; 4794 4795 do { 4796 mutex_lock(&fs_info->reclaim_bgs_lock); 4797 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4798 if (ret < 0) { 4799 mutex_unlock(&fs_info->reclaim_bgs_lock); 4800 goto done; 4801 } 4802 4803 ret = btrfs_previous_item(root, path, 0, key.type); 4804 if (ret) { 4805 mutex_unlock(&fs_info->reclaim_bgs_lock); 4806 if (ret < 0) 4807 goto done; 4808 ret = 0; 4809 btrfs_release_path(path); 4810 break; 4811 } 4812 4813 l = path->nodes[0]; 4814 slot = path->slots[0]; 4815 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 4816 4817 if (key.objectid != device->devid) { 4818 mutex_unlock(&fs_info->reclaim_bgs_lock); 4819 btrfs_release_path(path); 4820 break; 4821 } 4822 4823 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 4824 length = btrfs_dev_extent_length(l, dev_extent); 4825 4826 if (key.offset + length <= new_size) { 4827 mutex_unlock(&fs_info->reclaim_bgs_lock); 4828 btrfs_release_path(path); 4829 break; 4830 } 4831 4832 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 4833 btrfs_release_path(path); 4834 4835 /* 4836 * We may be relocating the only data chunk we have, 4837 * which could potentially end up with losing data's 4838 * raid profile, so lets allocate an empty one in 4839 * advance. 4840 */ 4841 ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset); 4842 if (ret < 0) { 4843 mutex_unlock(&fs_info->reclaim_bgs_lock); 4844 goto done; 4845 } 4846 4847 ret = btrfs_relocate_chunk(fs_info, chunk_offset); 4848 mutex_unlock(&fs_info->reclaim_bgs_lock); 4849 if (ret == -ENOSPC) { 4850 failed++; 4851 } else if (ret) { 4852 if (ret == -ETXTBSY) { 4853 btrfs_warn(fs_info, 4854 "could not shrink block group %llu due to active swapfile", 4855 chunk_offset); 4856 } 4857 goto done; 4858 } 4859 } while (key.offset-- > 0); 4860 4861 if (failed && !retried) { 4862 failed = 0; 4863 retried = true; 4864 goto again; 4865 } else if (failed && retried) { 4866 ret = -ENOSPC; 4867 goto done; 4868 } 4869 4870 /* Shrinking succeeded, else we would be at "done". */ 4871 trans = btrfs_start_transaction(root, 0); 4872 if (IS_ERR(trans)) { 4873 ret = PTR_ERR(trans); 4874 goto done; 4875 } 4876 4877 mutex_lock(&fs_info->chunk_mutex); 4878 /* Clear all state bits beyond the shrunk device size */ 4879 clear_extent_bits(&device->alloc_state, new_size, (u64)-1, 4880 CHUNK_STATE_MASK); 4881 4882 btrfs_device_set_disk_total_bytes(device, new_size); 4883 if (list_empty(&device->post_commit_list)) 4884 list_add_tail(&device->post_commit_list, 4885 &trans->transaction->dev_update_list); 4886 4887 WARN_ON(diff > old_total); 4888 btrfs_set_super_total_bytes(super_copy, 4889 round_down(old_total - diff, fs_info->sectorsize)); 4890 mutex_unlock(&fs_info->chunk_mutex); 4891 4892 /* Now btrfs_update_device() will change the on-disk size. 
*/ 4893 ret = btrfs_update_device(trans, device); 4894 if (ret < 0) { 4895 btrfs_abort_transaction(trans, ret); 4896 btrfs_end_transaction(trans); 4897 } else { 4898 ret = btrfs_commit_transaction(trans); 4899 } 4900 done: 4901 btrfs_free_path(path); 4902 if (ret) { 4903 mutex_lock(&fs_info->chunk_mutex); 4904 btrfs_device_set_total_bytes(device, old_size); 4905 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 4906 device->fs_devices->total_rw_bytes += diff; 4907 atomic64_add(diff, &fs_info->free_chunk_space); 4908 mutex_unlock(&fs_info->chunk_mutex); 4909 } 4910 return ret; 4911 } 4912 4913 static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info, 4914 struct btrfs_key *key, 4915 struct btrfs_chunk *chunk, int item_size) 4916 { 4917 struct btrfs_super_block *super_copy = fs_info->super_copy; 4918 struct btrfs_disk_key disk_key; 4919 u32 array_size; 4920 u8 *ptr; 4921 4922 lockdep_assert_held(&fs_info->chunk_mutex); 4923 4924 array_size = btrfs_super_sys_array_size(super_copy); 4925 if (array_size + item_size + sizeof(disk_key) 4926 > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) 4927 return -EFBIG; 4928 4929 ptr = super_copy->sys_chunk_array + array_size; 4930 btrfs_cpu_key_to_disk(&disk_key, key); 4931 memcpy(ptr, &disk_key, sizeof(disk_key)); 4932 ptr += sizeof(disk_key); 4933 memcpy(ptr, chunk, item_size); 4934 item_size += sizeof(disk_key); 4935 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 4936 4937 return 0; 4938 } 4939 4940 /* 4941 * sort the devices in descending order by max_avail, total_avail 4942 */ 4943 static int btrfs_cmp_device_info(const void *a, const void *b) 4944 { 4945 const struct btrfs_device_info *di_a = a; 4946 const struct btrfs_device_info *di_b = b; 4947 4948 if (di_a->max_avail > di_b->max_avail) 4949 return -1; 4950 if (di_a->max_avail < di_b->max_avail) 4951 return 1; 4952 if (di_a->total_avail > di_b->total_avail) 4953 return -1; 4954 if (di_a->total_avail < di_b->total_avail) 4955 return 1; 4956 return 0; 4957 } 4958 4959 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) 4960 { 4961 if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK)) 4962 return; 4963 4964 btrfs_set_fs_incompat(info, RAID56); 4965 } 4966 4967 static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type) 4968 { 4969 if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4))) 4970 return; 4971 4972 btrfs_set_fs_incompat(info, RAID1C34); 4973 } 4974 4975 /* 4976 * Structure used internally for __btrfs_alloc_chunk() function. 4977 * Wraps needed parameters. 
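 * The profile-derived fields (sub_stripes, dev_stripes, the devs_*
 * limits, ncopies, nparity) and the policy limits (max_stripe_size,
 * max_chunk_size, dev_extent_min) are filled by init_alloc_chunk_ctl();
 * ndevs comes from gather_device_info(), while num_stripes, stripe_size
 * and chunk_size are computed by decide_stripe_size().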
4978 */ 4979 struct alloc_chunk_ctl { 4980 u64 start; 4981 u64 type; 4982 /* Total number of stripes to allocate */ 4983 int num_stripes; 4984 /* sub_stripes info for map */ 4985 int sub_stripes; 4986 /* Stripes per device */ 4987 int dev_stripes; 4988 /* Maximum number of devices to use */ 4989 int devs_max; 4990 /* Minimum number of devices to use */ 4991 int devs_min; 4992 /* ndevs has to be a multiple of this */ 4993 int devs_increment; 4994 /* Number of copies */ 4995 int ncopies; 4996 /* Number of stripes worth of bytes to store parity information */ 4997 int nparity; 4998 u64 max_stripe_size; 4999 u64 max_chunk_size; 5000 u64 dev_extent_min; 5001 u64 stripe_size; 5002 u64 chunk_size; 5003 int ndevs; 5004 }; 5005 5006 static void init_alloc_chunk_ctl_policy_regular( 5007 struct btrfs_fs_devices *fs_devices, 5008 struct alloc_chunk_ctl *ctl) 5009 { 5010 u64 type = ctl->type; 5011 5012 if (type & BTRFS_BLOCK_GROUP_DATA) { 5013 ctl->max_stripe_size = SZ_1G; 5014 ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; 5015 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 5016 /* For larger filesystems, use larger metadata chunks */ 5017 if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) 5018 ctl->max_stripe_size = SZ_1G; 5019 else 5020 ctl->max_stripe_size = SZ_256M; 5021 ctl->max_chunk_size = ctl->max_stripe_size; 5022 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 5023 ctl->max_stripe_size = SZ_32M; 5024 ctl->max_chunk_size = 2 * ctl->max_stripe_size; 5025 ctl->devs_max = min_t(int, ctl->devs_max, 5026 BTRFS_MAX_DEVS_SYS_CHUNK); 5027 } else { 5028 BUG(); 5029 } 5030 5031 /* We don't want a chunk larger than 10% of writable space */ 5032 ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 5033 ctl->max_chunk_size); 5034 ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes; 5035 } 5036 5037 static void init_alloc_chunk_ctl_policy_zoned( 5038 struct btrfs_fs_devices *fs_devices, 5039 struct alloc_chunk_ctl *ctl) 5040 { 5041 u64 zone_size = fs_devices->fs_info->zone_size; 5042 u64 limit; 5043 int min_num_stripes = ctl->devs_min * ctl->dev_stripes; 5044 int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies; 5045 u64 min_chunk_size = min_data_stripes * zone_size; 5046 u64 type = ctl->type; 5047 5048 ctl->max_stripe_size = zone_size; 5049 if (type & BTRFS_BLOCK_GROUP_DATA) { 5050 ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE, 5051 zone_size); 5052 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 5053 ctl->max_chunk_size = ctl->max_stripe_size; 5054 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 5055 ctl->max_chunk_size = 2 * ctl->max_stripe_size; 5056 ctl->devs_max = min_t(int, ctl->devs_max, 5057 BTRFS_MAX_DEVS_SYS_CHUNK); 5058 } else { 5059 BUG(); 5060 } 5061 5062 /* We don't want a chunk larger than 10% of writable space */ 5063 limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1), 5064 zone_size), 5065 min_chunk_size); 5066 ctl->max_chunk_size = min(limit, ctl->max_chunk_size); 5067 ctl->dev_extent_min = zone_size * ctl->dev_stripes; 5068 } 5069 5070 static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, 5071 struct alloc_chunk_ctl *ctl) 5072 { 5073 int index = btrfs_bg_flags_to_raid_index(ctl->type); 5074 5075 ctl->sub_stripes = btrfs_raid_array[index].sub_stripes; 5076 ctl->dev_stripes = btrfs_raid_array[index].dev_stripes; 5077 ctl->devs_max = btrfs_raid_array[index].devs_max; 5078 if (!ctl->devs_max) 5079 ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info); 5080 ctl->devs_min = btrfs_raid_array[index].devs_min; 5081 
ctl->devs_increment = btrfs_raid_array[index].devs_increment; 5082 ctl->ncopies = btrfs_raid_array[index].ncopies; 5083 ctl->nparity = btrfs_raid_array[index].nparity; 5084 ctl->ndevs = 0; 5085 5086 switch (fs_devices->chunk_alloc_policy) { 5087 case BTRFS_CHUNK_ALLOC_REGULAR: 5088 init_alloc_chunk_ctl_policy_regular(fs_devices, ctl); 5089 break; 5090 case BTRFS_CHUNK_ALLOC_ZONED: 5091 init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl); 5092 break; 5093 default: 5094 BUG(); 5095 } 5096 } 5097 5098 static int gather_device_info(struct btrfs_fs_devices *fs_devices, 5099 struct alloc_chunk_ctl *ctl, 5100 struct btrfs_device_info *devices_info) 5101 { 5102 struct btrfs_fs_info *info = fs_devices->fs_info; 5103 struct btrfs_device *device; 5104 u64 total_avail; 5105 u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes; 5106 int ret; 5107 int ndevs = 0; 5108 u64 max_avail; 5109 u64 dev_offset; 5110 5111 /* 5112 * in the first pass through the devices list, we gather information 5113 * about the available holes on each device. 5114 */ 5115 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 5116 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 5117 WARN(1, KERN_ERR 5118 "BTRFS: read-only device in alloc_list\n"); 5119 continue; 5120 } 5121 5122 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 5123 &device->dev_state) || 5124 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 5125 continue; 5126 5127 if (device->total_bytes > device->bytes_used) 5128 total_avail = device->total_bytes - device->bytes_used; 5129 else 5130 total_avail = 0; 5131 5132 /* If there is no space on this device, skip it. */ 5133 if (total_avail < ctl->dev_extent_min) 5134 continue; 5135 5136 ret = find_free_dev_extent(device, dev_extent_want, &dev_offset, 5137 &max_avail); 5138 if (ret && ret != -ENOSPC) 5139 return ret; 5140 5141 if (ret == 0) 5142 max_avail = dev_extent_want; 5143 5144 if (max_avail < ctl->dev_extent_min) { 5145 if (btrfs_test_opt(info, ENOSPC_DEBUG)) 5146 btrfs_debug(info, 5147 "%s: devid %llu has no free space, have=%llu want=%llu", 5148 __func__, device->devid, max_avail, 5149 ctl->dev_extent_min); 5150 continue; 5151 } 5152 5153 if (ndevs == fs_devices->rw_devices) { 5154 WARN(1, "%s: found more than %llu devices\n", 5155 __func__, fs_devices->rw_devices); 5156 break; 5157 } 5158 devices_info[ndevs].dev_offset = dev_offset; 5159 devices_info[ndevs].max_avail = max_avail; 5160 devices_info[ndevs].total_avail = total_avail; 5161 devices_info[ndevs].dev = device; 5162 ++ndevs; 5163 } 5164 ctl->ndevs = ndevs; 5165 5166 /* 5167 * now sort the devices by hole size / available space 5168 */ 5169 sort(devices_info, ndevs, sizeof(struct btrfs_device_info), 5170 btrfs_cmp_device_info, NULL); 5171 5172 return 0; 5173 } 5174 5175 static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl, 5176 struct btrfs_device_info *devices_info) 5177 { 5178 /* Number of stripes that count for block group size */ 5179 int data_stripes; 5180 5181 /* 5182 * The primary goal is to maximize the number of stripes, so use as 5183 * many devices as possible, even if the stripes are not maximum sized. 5184 * 5185 * The DUP profile stores more than one stripe per device, the 5186 * max_avail is the total size so we have to adjust. 
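 * For example, with DUP (dev_stripes == 2) and 10GiB of max_avail on the
 * smallest selected device, each stripe starts out at 5GiB before the
 * max_chunk_size clamp below is applied.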
5187 */ 5188 ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail, 5189 ctl->dev_stripes); 5190 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; 5191 5192 /* This will have to be fixed for RAID1 and RAID10 over more drives */ 5193 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; 5194 5195 /* 5196 * Use the number of data stripes to figure out how big this chunk is 5197 * really going to be in terms of logical address space, and compare 5198 * that answer with the max chunk size. If it's higher, we try to 5199 * reduce stripe_size. 5200 */ 5201 if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { 5202 /* 5203 * Reduce stripe_size, round it up to a 16MB boundary again and 5204 * then use it, unless it ends up being even bigger than the 5205 * previous value we had already. 5206 */ 5207 ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size, 5208 data_stripes), SZ_16M), 5209 ctl->stripe_size); 5210 } 5211 5212 /* Align to BTRFS_STRIPE_LEN */ 5213 ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN); 5214 ctl->chunk_size = ctl->stripe_size * data_stripes; 5215 5216 return 0; 5217 } 5218 5219 static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl, 5220 struct btrfs_device_info *devices_info) 5221 { 5222 u64 zone_size = devices_info[0].dev->zone_info->zone_size; 5223 /* Number of stripes that count for block group size */ 5224 int data_stripes; 5225 5226 /* 5227 * It should hold because: 5228 * dev_extent_min == dev_extent_want == zone_size * dev_stripes 5229 */ 5230 ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min); 5231 5232 ctl->stripe_size = zone_size; 5233 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; 5234 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; 5235 5236 /* stripe_size is fixed in zoned filesysmte. Reduce ndevs instead. */ 5237 if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { 5238 ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies, 5239 ctl->stripe_size) + ctl->nparity, 5240 ctl->dev_stripes); 5241 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; 5242 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; 5243 ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size); 5244 } 5245 5246 ctl->chunk_size = ctl->stripe_size * data_stripes; 5247 5248 return 0; 5249 } 5250 5251 static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, 5252 struct alloc_chunk_ctl *ctl, 5253 struct btrfs_device_info *devices_info) 5254 { 5255 struct btrfs_fs_info *info = fs_devices->fs_info; 5256 5257 /* 5258 * Round down to number of usable stripes, devs_increment can be any 5259 * number so we can't use round_down() that requires power of 2, while 5260 * rounddown is safe. 
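 * For example, RAID10 has devs_increment == 2, so five devices with
 * enough free space are rounded down to four usable ones here.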
5261 */ 5262 ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment); 5263 5264 if (ctl->ndevs < ctl->devs_min) { 5265 if (btrfs_test_opt(info, ENOSPC_DEBUG)) { 5266 btrfs_debug(info, 5267 "%s: not enough devices with free space: have=%d minimum required=%d", 5268 __func__, ctl->ndevs, ctl->devs_min); 5269 } 5270 return -ENOSPC; 5271 } 5272 5273 ctl->ndevs = min(ctl->ndevs, ctl->devs_max); 5274 5275 switch (fs_devices->chunk_alloc_policy) { 5276 case BTRFS_CHUNK_ALLOC_REGULAR: 5277 return decide_stripe_size_regular(ctl, devices_info); 5278 case BTRFS_CHUNK_ALLOC_ZONED: 5279 return decide_stripe_size_zoned(ctl, devices_info); 5280 default: 5281 BUG(); 5282 } 5283 } 5284 5285 static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, 5286 struct alloc_chunk_ctl *ctl, 5287 struct btrfs_device_info *devices_info) 5288 { 5289 struct btrfs_fs_info *info = trans->fs_info; 5290 struct map_lookup *map = NULL; 5291 struct extent_map_tree *em_tree; 5292 struct btrfs_block_group *block_group; 5293 struct extent_map *em; 5294 u64 start = ctl->start; 5295 u64 type = ctl->type; 5296 int ret; 5297 int i; 5298 int j; 5299 5300 map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS); 5301 if (!map) 5302 return ERR_PTR(-ENOMEM); 5303 map->num_stripes = ctl->num_stripes; 5304 5305 for (i = 0; i < ctl->ndevs; ++i) { 5306 for (j = 0; j < ctl->dev_stripes; ++j) { 5307 int s = i * ctl->dev_stripes + j; 5308 map->stripes[s].dev = devices_info[i].dev; 5309 map->stripes[s].physical = devices_info[i].dev_offset + 5310 j * ctl->stripe_size; 5311 } 5312 } 5313 map->stripe_len = BTRFS_STRIPE_LEN; 5314 map->io_align = BTRFS_STRIPE_LEN; 5315 map->io_width = BTRFS_STRIPE_LEN; 5316 map->type = type; 5317 map->sub_stripes = ctl->sub_stripes; 5318 5319 trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size); 5320 5321 em = alloc_extent_map(); 5322 if (!em) { 5323 kfree(map); 5324 return ERR_PTR(-ENOMEM); 5325 } 5326 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 5327 em->map_lookup = map; 5328 em->start = start; 5329 em->len = ctl->chunk_size; 5330 em->block_start = 0; 5331 em->block_len = em->len; 5332 em->orig_block_len = ctl->stripe_size; 5333 5334 em_tree = &info->mapping_tree; 5335 write_lock(&em_tree->lock); 5336 ret = add_extent_mapping(em_tree, em, 0); 5337 if (ret) { 5338 write_unlock(&em_tree->lock); 5339 free_extent_map(em); 5340 return ERR_PTR(ret); 5341 } 5342 write_unlock(&em_tree->lock); 5343 5344 block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size); 5345 if (IS_ERR(block_group)) 5346 goto error_del_extent; 5347 5348 for (i = 0; i < map->num_stripes; i++) { 5349 struct btrfs_device *dev = map->stripes[i].dev; 5350 5351 btrfs_device_set_bytes_used(dev, 5352 dev->bytes_used + ctl->stripe_size); 5353 if (list_empty(&dev->post_commit_list)) 5354 list_add_tail(&dev->post_commit_list, 5355 &trans->transaction->dev_update_list); 5356 } 5357 5358 atomic64_sub(ctl->stripe_size * map->num_stripes, 5359 &info->free_chunk_space); 5360 5361 free_extent_map(em); 5362 check_raid56_incompat_flag(info, type); 5363 check_raid1c34_incompat_flag(info, type); 5364 5365 return block_group; 5366 5367 error_del_extent: 5368 write_lock(&em_tree->lock); 5369 remove_extent_mapping(em_tree, em); 5370 write_unlock(&em_tree->lock); 5371 5372 /* One for our allocation */ 5373 free_extent_map(em); 5374 /* One for the tree reference */ 5375 free_extent_map(em); 5376 5377 return block_group; 5378 } 5379 5380 struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 5381 u64 
type) 5382 { 5383 struct btrfs_fs_info *info = trans->fs_info; 5384 struct btrfs_fs_devices *fs_devices = info->fs_devices; 5385 struct btrfs_device_info *devices_info = NULL; 5386 struct alloc_chunk_ctl ctl; 5387 struct btrfs_block_group *block_group; 5388 int ret; 5389 5390 lockdep_assert_held(&info->chunk_mutex); 5391 5392 if (!alloc_profile_is_valid(type, 0)) { 5393 ASSERT(0); 5394 return ERR_PTR(-EINVAL); 5395 } 5396 5397 if (list_empty(&fs_devices->alloc_list)) { 5398 if (btrfs_test_opt(info, ENOSPC_DEBUG)) 5399 btrfs_debug(info, "%s: no writable device", __func__); 5400 return ERR_PTR(-ENOSPC); 5401 } 5402 5403 if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { 5404 btrfs_err(info, "invalid chunk type 0x%llx requested", type); 5405 ASSERT(0); 5406 return ERR_PTR(-EINVAL); 5407 } 5408 5409 ctl.start = find_next_chunk(info); 5410 ctl.type = type; 5411 init_alloc_chunk_ctl(fs_devices, &ctl); 5412 5413 devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), 5414 GFP_NOFS); 5415 if (!devices_info) 5416 return ERR_PTR(-ENOMEM); 5417 5418 ret = gather_device_info(fs_devices, &ctl, devices_info); 5419 if (ret < 0) { 5420 block_group = ERR_PTR(ret); 5421 goto out; 5422 } 5423 5424 ret = decide_stripe_size(fs_devices, &ctl, devices_info); 5425 if (ret < 0) { 5426 block_group = ERR_PTR(ret); 5427 goto out; 5428 } 5429 5430 block_group = create_chunk(trans, &ctl, devices_info); 5431 5432 out: 5433 kfree(devices_info); 5434 return block_group; 5435 } 5436 5437 /* 5438 * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the 5439 * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system 5440 * chunks. 5441 * 5442 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation 5443 * phases. 5444 */ 5445 int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, 5446 struct btrfs_block_group *bg) 5447 { 5448 struct btrfs_fs_info *fs_info = trans->fs_info; 5449 struct btrfs_root *extent_root = fs_info->extent_root; 5450 struct btrfs_root *chunk_root = fs_info->chunk_root; 5451 struct btrfs_key key; 5452 struct btrfs_chunk *chunk; 5453 struct btrfs_stripe *stripe; 5454 struct extent_map *em; 5455 struct map_lookup *map; 5456 size_t item_size; 5457 int i; 5458 int ret; 5459 5460 /* 5461 * We take the chunk_mutex for 2 reasons: 5462 * 5463 * 1) Updates and insertions in the chunk btree must be done while holding 5464 * the chunk_mutex, as well as updating the system chunk array in the 5465 * superblock. See the comment on top of btrfs_chunk_alloc() for the 5466 * details; 5467 * 5468 * 2) To prevent races with the final phase of a device replace operation 5469 * that replaces the device object associated with the map's stripes, 5470 * because the device object's id can change at any time during that 5471 * final phase of the device replace operation 5472 * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the 5473 * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID, 5474 * which would cause a failure when updating the device item, which does 5475 * not exists, or persisting a stripe of the chunk item with such ID. 5476 * Here we can't use the device_list_mutex because our caller already 5477 * has locked the chunk_mutex, and the final phase of device replace 5478 * acquires both mutexes - first the device_list_mutex and then the 5479 * chunk_mutex. Using any of those two mutexes protects us from a 5480 * concurrent device replace. 
5481 */ 5482 lockdep_assert_held(&fs_info->chunk_mutex); 5483 5484 em = btrfs_get_chunk_map(fs_info, bg->start, bg->length); 5485 if (IS_ERR(em)) { 5486 ret = PTR_ERR(em); 5487 btrfs_abort_transaction(trans, ret); 5488 return ret; 5489 } 5490 5491 map = em->map_lookup; 5492 item_size = btrfs_chunk_item_size(map->num_stripes); 5493 5494 chunk = kzalloc(item_size, GFP_NOFS); 5495 if (!chunk) { 5496 ret = -ENOMEM; 5497 btrfs_abort_transaction(trans, ret); 5498 goto out; 5499 } 5500 5501 for (i = 0; i < map->num_stripes; i++) { 5502 struct btrfs_device *device = map->stripes[i].dev; 5503 5504 ret = btrfs_update_device(trans, device); 5505 if (ret) 5506 goto out; 5507 } 5508 5509 stripe = &chunk->stripe; 5510 for (i = 0; i < map->num_stripes; i++) { 5511 struct btrfs_device *device = map->stripes[i].dev; 5512 const u64 dev_offset = map->stripes[i].physical; 5513 5514 btrfs_set_stack_stripe_devid(stripe, device->devid); 5515 btrfs_set_stack_stripe_offset(stripe, dev_offset); 5516 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 5517 stripe++; 5518 } 5519 5520 btrfs_set_stack_chunk_length(chunk, bg->length); 5521 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); 5522 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); 5523 btrfs_set_stack_chunk_type(chunk, map->type); 5524 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 5525 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); 5526 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); 5527 btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize); 5528 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 5529 5530 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 5531 key.type = BTRFS_CHUNK_ITEM_KEY; 5532 key.offset = bg->start; 5533 5534 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 5535 if (ret) 5536 goto out; 5537 5538 bg->chunk_item_inserted = 1; 5539 5540 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 5541 ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size); 5542 if (ret) 5543 goto out; 5544 } 5545 5546 out: 5547 kfree(chunk); 5548 free_extent_map(em); 5549 return ret; 5550 } 5551 5552 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) 5553 { 5554 struct btrfs_fs_info *fs_info = trans->fs_info; 5555 u64 alloc_profile; 5556 struct btrfs_block_group *meta_bg; 5557 struct btrfs_block_group *sys_bg; 5558 5559 /* 5560 * When adding a new device for sprouting, the seed device is read-only 5561 * so we must first allocate a metadata and a system chunk. But before 5562 * adding the block group items to the extent, device and chunk btrees, 5563 * we must first: 5564 * 5565 * 1) Create both chunks without doing any changes to the btrees, as 5566 * otherwise we would get -ENOSPC since the block groups from the 5567 * seed device are read-only; 5568 * 5569 * 2) Add the device item for the new sprout device - finishing the setup 5570 * of a new block group requires updating the device item in the chunk 5571 * btree, so it must exist when we attempt to do it. The previous step 5572 * ensures this does not fail with -ENOSPC. 5573 * 5574 * After that we can add the block group items to their btrees: 5575 * update existing device item in the chunk btree, add a new block group 5576 * item to the extent btree, add a new chunk item to the chunk btree and 5577 * finally add the new device extent items to the devices btree. 
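 * (That is why this function only creates the two chunks below and
 * returns; the corresponding items reach the btrees later, once the new
 * block groups are persisted.)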
5578 */ 5579 5580 alloc_profile = btrfs_metadata_alloc_profile(fs_info); 5581 meta_bg = btrfs_alloc_chunk(trans, alloc_profile); 5582 if (IS_ERR(meta_bg)) 5583 return PTR_ERR(meta_bg); 5584 5585 alloc_profile = btrfs_system_alloc_profile(fs_info); 5586 sys_bg = btrfs_alloc_chunk(trans, alloc_profile); 5587 if (IS_ERR(sys_bg)) 5588 return PTR_ERR(sys_bg); 5589 5590 return 0; 5591 } 5592 5593 static inline int btrfs_chunk_max_errors(struct map_lookup *map) 5594 { 5595 const int index = btrfs_bg_flags_to_raid_index(map->type); 5596 5597 return btrfs_raid_array[index].tolerated_failures; 5598 } 5599 5600 int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) 5601 { 5602 struct extent_map *em; 5603 struct map_lookup *map; 5604 int readonly = 0; 5605 int miss_ndevs = 0; 5606 int i; 5607 5608 em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); 5609 if (IS_ERR(em)) 5610 return 1; 5611 5612 map = em->map_lookup; 5613 for (i = 0; i < map->num_stripes; i++) { 5614 if (test_bit(BTRFS_DEV_STATE_MISSING, 5615 &map->stripes[i].dev->dev_state)) { 5616 miss_ndevs++; 5617 continue; 5618 } 5619 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, 5620 &map->stripes[i].dev->dev_state)) { 5621 readonly = 1; 5622 goto end; 5623 } 5624 } 5625 5626 /* 5627 * If the number of missing devices is larger than max errors, 5628 * we can not write the data into that chunk successfully, so 5629 * set it readonly. 5630 */ 5631 if (miss_ndevs > btrfs_chunk_max_errors(map)) 5632 readonly = 1; 5633 end: 5634 free_extent_map(em); 5635 return readonly; 5636 } 5637 5638 void btrfs_mapping_tree_free(struct extent_map_tree *tree) 5639 { 5640 struct extent_map *em; 5641 5642 while (1) { 5643 write_lock(&tree->lock); 5644 em = lookup_extent_mapping(tree, 0, (u64)-1); 5645 if (em) 5646 remove_extent_mapping(tree, em); 5647 write_unlock(&tree->lock); 5648 if (!em) 5649 break; 5650 /* once for us */ 5651 free_extent_map(em); 5652 /* once for the tree */ 5653 free_extent_map(em); 5654 } 5655 } 5656 5657 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5658 { 5659 struct extent_map *em; 5660 struct map_lookup *map; 5661 int ret; 5662 5663 em = btrfs_get_chunk_map(fs_info, logical, len); 5664 if (IS_ERR(em)) 5665 /* 5666 * We could return errors for these cases, but that could get 5667 * ugly and we'd probably do the same thing which is just not do 5668 * anything else and exit, so return 1 so the callers don't try 5669 * to use other copies. 5670 */ 5671 return 1; 5672 5673 map = em->map_lookup; 5674 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK)) 5675 ret = map->num_stripes; 5676 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5677 ret = map->sub_stripes; 5678 else if (map->type & BTRFS_BLOCK_GROUP_RAID5) 5679 ret = 2; 5680 else if (map->type & BTRFS_BLOCK_GROUP_RAID6) 5681 /* 5682 * There could be two corrupted data stripes, we need 5683 * to loop retry in order to rebuild the correct data. 5684 * 5685 * Fail a stripe at a time on every retry except the 5686 * stripe under reconstruction. 
5687 */ 5688 ret = map->num_stripes; 5689 else 5690 ret = 1; 5691 free_extent_map(em); 5692 5693 down_read(&fs_info->dev_replace.rwsem); 5694 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && 5695 fs_info->dev_replace.tgtdev) 5696 ret++; 5697 up_read(&fs_info->dev_replace.rwsem); 5698 5699 return ret; 5700 } 5701 5702 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, 5703 u64 logical) 5704 { 5705 struct extent_map *em; 5706 struct map_lookup *map; 5707 unsigned long len = fs_info->sectorsize; 5708 5709 em = btrfs_get_chunk_map(fs_info, logical, len); 5710 5711 if (!WARN_ON(IS_ERR(em))) { 5712 map = em->map_lookup; 5713 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 5714 len = map->stripe_len * nr_data_stripes(map); 5715 free_extent_map(em); 5716 } 5717 return len; 5718 } 5719 5720 int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5721 { 5722 struct extent_map *em; 5723 struct map_lookup *map; 5724 int ret = 0; 5725 5726 em = btrfs_get_chunk_map(fs_info, logical, len); 5727 5728 if(!WARN_ON(IS_ERR(em))) { 5729 map = em->map_lookup; 5730 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 5731 ret = 1; 5732 free_extent_map(em); 5733 } 5734 return ret; 5735 } 5736 5737 static int find_live_mirror(struct btrfs_fs_info *fs_info, 5738 struct map_lookup *map, int first, 5739 int dev_replace_is_ongoing) 5740 { 5741 int i; 5742 int num_stripes; 5743 int preferred_mirror; 5744 int tolerance; 5745 struct btrfs_device *srcdev; 5746 5747 ASSERT((map->type & 5748 (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10))); 5749 5750 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5751 num_stripes = map->sub_stripes; 5752 else 5753 num_stripes = map->num_stripes; 5754 5755 switch (fs_info->fs_devices->read_policy) { 5756 default: 5757 /* Shouldn't happen, just warn and use pid instead of failing */ 5758 btrfs_warn_rl(fs_info, 5759 "unknown read_policy type %u, reset to pid", 5760 fs_info->fs_devices->read_policy); 5761 fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID; 5762 fallthrough; 5763 case BTRFS_READ_POLICY_PID: 5764 preferred_mirror = first + (current->pid % num_stripes); 5765 break; 5766 } 5767 5768 if (dev_replace_is_ongoing && 5769 fs_info->dev_replace.cont_reading_from_srcdev_mode == 5770 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) 5771 srcdev = fs_info->dev_replace.srcdev; 5772 else 5773 srcdev = NULL; 5774 5775 /* 5776 * try to avoid the drive that is the source drive for a 5777 * dev-replace procedure, only choose it if no other non-missing 5778 * mirror is available 5779 */ 5780 for (tolerance = 0; tolerance < 2; tolerance++) { 5781 if (map->stripes[preferred_mirror].dev->bdev && 5782 (tolerance || map->stripes[preferred_mirror].dev != srcdev)) 5783 return preferred_mirror; 5784 for (i = first; i < first + num_stripes; i++) { 5785 if (map->stripes[i].dev->bdev && 5786 (tolerance || map->stripes[i].dev != srcdev)) 5787 return i; 5788 } 5789 } 5790 5791 /* we couldn't find one that doesn't fail. 
Just return something 5792 * and the io error handling code will clean up eventually 5793 */ 5794 return preferred_mirror; 5795 } 5796 5797 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ 5798 static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes) 5799 { 5800 int i; 5801 int again = 1; 5802 5803 while (again) { 5804 again = 0; 5805 for (i = 0; i < num_stripes - 1; i++) { 5806 /* Swap if parity is on a smaller index */ 5807 if (bbio->raid_map[i] > bbio->raid_map[i + 1]) { 5808 swap(bbio->stripes[i], bbio->stripes[i + 1]); 5809 swap(bbio->raid_map[i], bbio->raid_map[i + 1]); 5810 again = 1; 5811 } 5812 } 5813 } 5814 } 5815 5816 static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes) 5817 { 5818 struct btrfs_bio *bbio = kzalloc( 5819 /* the size of the btrfs_bio */ 5820 sizeof(struct btrfs_bio) + 5821 /* plus the variable array for the stripes */ 5822 sizeof(struct btrfs_bio_stripe) * (total_stripes) + 5823 /* plus the variable array for the tgt dev */ 5824 sizeof(int) * (real_stripes) + 5825 /* 5826 * plus the raid_map, which includes both the tgt dev 5827 * and the stripes 5828 */ 5829 sizeof(u64) * (total_stripes), 5830 GFP_NOFS|__GFP_NOFAIL); 5831 5832 atomic_set(&bbio->error, 0); 5833 refcount_set(&bbio->refs, 1); 5834 5835 bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes); 5836 bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes); 5837 5838 return bbio; 5839 } 5840 5841 void btrfs_get_bbio(struct btrfs_bio *bbio) 5842 { 5843 WARN_ON(!refcount_read(&bbio->refs)); 5844 refcount_inc(&bbio->refs); 5845 } 5846 5847 void btrfs_put_bbio(struct btrfs_bio *bbio) 5848 { 5849 if (!bbio) 5850 return; 5851 if (refcount_dec_and_test(&bbio->refs)) 5852 kfree(bbio); 5853 } 5854 5855 /* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */ 5856 /* 5857 * Please note that, discard won't be sent to target device of device 5858 * replace. 
5859 */ 5860 static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, 5861 u64 logical, u64 *length_ret, 5862 struct btrfs_bio **bbio_ret) 5863 { 5864 struct extent_map *em; 5865 struct map_lookup *map; 5866 struct btrfs_bio *bbio; 5867 u64 length = *length_ret; 5868 u64 offset; 5869 u64 stripe_nr; 5870 u64 stripe_nr_end; 5871 u64 stripe_end_offset; 5872 u64 stripe_cnt; 5873 u64 stripe_len; 5874 u64 stripe_offset; 5875 u64 num_stripes; 5876 u32 stripe_index; 5877 u32 factor = 0; 5878 u32 sub_stripes = 0; 5879 u64 stripes_per_dev = 0; 5880 u32 remaining_stripes = 0; 5881 u32 last_stripe = 0; 5882 int ret = 0; 5883 int i; 5884 5885 /* discard always return a bbio */ 5886 ASSERT(bbio_ret); 5887 5888 em = btrfs_get_chunk_map(fs_info, logical, length); 5889 if (IS_ERR(em)) 5890 return PTR_ERR(em); 5891 5892 map = em->map_lookup; 5893 /* we don't discard raid56 yet */ 5894 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5895 ret = -EOPNOTSUPP; 5896 goto out; 5897 } 5898 5899 offset = logical - em->start; 5900 length = min_t(u64, em->start + em->len - logical, length); 5901 *length_ret = length; 5902 5903 stripe_len = map->stripe_len; 5904 /* 5905 * stripe_nr counts the total number of stripes we have to stride 5906 * to get to this block 5907 */ 5908 stripe_nr = div64_u64(offset, stripe_len); 5909 5910 /* stripe_offset is the offset of this block in its stripe */ 5911 stripe_offset = offset - stripe_nr * stripe_len; 5912 5913 stripe_nr_end = round_up(offset + length, map->stripe_len); 5914 stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len); 5915 stripe_cnt = stripe_nr_end - stripe_nr; 5916 stripe_end_offset = stripe_nr_end * map->stripe_len - 5917 (offset + length); 5918 /* 5919 * after this, stripe_nr is the number of stripes on this 5920 * device we have to walk to find the data, and stripe_index is 5921 * the number of our device in the stripe array 5922 */ 5923 num_stripes = 1; 5924 stripe_index = 0; 5925 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 5926 BTRFS_BLOCK_GROUP_RAID10)) { 5927 if (map->type & BTRFS_BLOCK_GROUP_RAID0) 5928 sub_stripes = 1; 5929 else 5930 sub_stripes = map->sub_stripes; 5931 5932 factor = map->num_stripes / sub_stripes; 5933 num_stripes = min_t(u64, map->num_stripes, 5934 sub_stripes * stripe_cnt); 5935 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 5936 stripe_index *= sub_stripes; 5937 stripes_per_dev = div_u64_rem(stripe_cnt, factor, 5938 &remaining_stripes); 5939 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); 5940 last_stripe *= sub_stripes; 5941 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | 5942 BTRFS_BLOCK_GROUP_DUP)) { 5943 num_stripes = map->num_stripes; 5944 } else { 5945 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 5946 &stripe_index); 5947 } 5948 5949 bbio = alloc_btrfs_bio(num_stripes, 0); 5950 if (!bbio) { 5951 ret = -ENOMEM; 5952 goto out; 5953 } 5954 5955 for (i = 0; i < num_stripes; i++) { 5956 bbio->stripes[i].physical = 5957 map->stripes[stripe_index].physical + 5958 stripe_offset + stripe_nr * map->stripe_len; 5959 bbio->stripes[i].dev = map->stripes[stripe_index].dev; 5960 5961 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 5962 BTRFS_BLOCK_GROUP_RAID10)) { 5963 bbio->stripes[i].length = stripes_per_dev * 5964 map->stripe_len; 5965 5966 if (i / sub_stripes < remaining_stripes) 5967 bbio->stripes[i].length += 5968 map->stripe_len; 5969 5970 /* 5971 * Special for the first stripe and 5972 * the last stripe: 5973 * 5974 * |-------|...|-------| 5975 * |----------| 5976 * off end_off 5977 */ 5978 if 
(i < sub_stripes) 5979 bbio->stripes[i].length -= 5980 stripe_offset; 5981 5982 if (stripe_index >= last_stripe && 5983 stripe_index <= (last_stripe + 5984 sub_stripes - 1)) 5985 bbio->stripes[i].length -= 5986 stripe_end_offset; 5987 5988 if (i == sub_stripes - 1) 5989 stripe_offset = 0; 5990 } else { 5991 bbio->stripes[i].length = length; 5992 } 5993 5994 stripe_index++; 5995 if (stripe_index == map->num_stripes) { 5996 stripe_index = 0; 5997 stripe_nr++; 5998 } 5999 } 6000 6001 *bbio_ret = bbio; 6002 bbio->map_type = map->type; 6003 bbio->num_stripes = num_stripes; 6004 out: 6005 free_extent_map(em); 6006 return ret; 6007 } 6008 6009 /* 6010 * In dev-replace case, for repair case (that's the only case where the mirror 6011 * is selected explicitly when calling btrfs_map_block), blocks left of the 6012 * left cursor can also be read from the target drive. 6013 * 6014 * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the 6015 * array of stripes. 6016 * For READ, it also needs to be supported using the same mirror number. 6017 * 6018 * If the requested block is not left of the left cursor, EIO is returned. This 6019 * can happen because btrfs_num_copies() returns one more in the dev-replace 6020 * case. 6021 */ 6022 static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, 6023 u64 logical, u64 length, 6024 u64 srcdev_devid, int *mirror_num, 6025 u64 *physical) 6026 { 6027 struct btrfs_bio *bbio = NULL; 6028 int num_stripes; 6029 int index_srcdev = 0; 6030 int found = 0; 6031 u64 physical_of_found = 0; 6032 int i; 6033 int ret = 0; 6034 6035 ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, 6036 logical, &length, &bbio, 0, 0); 6037 if (ret) { 6038 ASSERT(bbio == NULL); 6039 return ret; 6040 } 6041 6042 num_stripes = bbio->num_stripes; 6043 if (*mirror_num > num_stripes) { 6044 /* 6045 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror, 6046 * that means that the requested area is not left of the left 6047 * cursor 6048 */ 6049 btrfs_put_bbio(bbio); 6050 return -EIO; 6051 } 6052 6053 /* 6054 * process the rest of the function using the mirror_num of the source 6055 * drive. Therefore look it up first. At the end, patch the device 6056 * pointer to the one of the target drive. 
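 *
 * Put differently: the stripe index of the source device is translated
 * into a mirror number (index + 1), and the physical offset found there
 * is handed back so that the caller can point the read at the replace
 * target device at that same physical offset.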
6057 */ 6058 for (i = 0; i < num_stripes; i++) { 6059 if (bbio->stripes[i].dev->devid != srcdev_devid) 6060 continue; 6061 6062 /* 6063 * In case of DUP, in order to keep it simple, only add the 6064 * mirror with the lowest physical address 6065 */ 6066 if (found && 6067 physical_of_found <= bbio->stripes[i].physical) 6068 continue; 6069 6070 index_srcdev = i; 6071 found = 1; 6072 physical_of_found = bbio->stripes[i].physical; 6073 } 6074 6075 btrfs_put_bbio(bbio); 6076 6077 ASSERT(found); 6078 if (!found) 6079 return -EIO; 6080 6081 *mirror_num = index_srcdev + 1; 6082 *physical = physical_of_found; 6083 return ret; 6084 } 6085 6086 static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) 6087 { 6088 struct btrfs_block_group *cache; 6089 bool ret; 6090 6091 /* Non zoned filesystem does not use "to_copy" flag */ 6092 if (!btrfs_is_zoned(fs_info)) 6093 return false; 6094 6095 cache = btrfs_lookup_block_group(fs_info, logical); 6096 6097 spin_lock(&cache->lock); 6098 ret = cache->to_copy; 6099 spin_unlock(&cache->lock); 6100 6101 btrfs_put_block_group(cache); 6102 return ret; 6103 } 6104 6105 static void handle_ops_on_dev_replace(enum btrfs_map_op op, 6106 struct btrfs_bio **bbio_ret, 6107 struct btrfs_dev_replace *dev_replace, 6108 u64 logical, 6109 int *num_stripes_ret, int *max_errors_ret) 6110 { 6111 struct btrfs_bio *bbio = *bbio_ret; 6112 u64 srcdev_devid = dev_replace->srcdev->devid; 6113 int tgtdev_indexes = 0; 6114 int num_stripes = *num_stripes_ret; 6115 int max_errors = *max_errors_ret; 6116 int i; 6117 6118 if (op == BTRFS_MAP_WRITE) { 6119 int index_where_to_add; 6120 6121 /* 6122 * A block group which have "to_copy" set will eventually 6123 * copied by dev-replace process. We can avoid cloning IO here. 6124 */ 6125 if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical)) 6126 return; 6127 6128 /* 6129 * duplicate the write operations while the dev replace 6130 * procedure is running. Since the copying of the old disk to 6131 * the new disk takes place at run time while the filesystem is 6132 * mounted writable, the regular write operations to the old 6133 * disk have to be duplicated to go to the new disk as well. 6134 * 6135 * Note that device->missing is handled by the caller, and that 6136 * the write to the old disk is already set up in the stripes 6137 * array. 6138 */ 6139 index_where_to_add = num_stripes; 6140 for (i = 0; i < num_stripes; i++) { 6141 if (bbio->stripes[i].dev->devid == srcdev_devid) { 6142 /* write to new disk, too */ 6143 struct btrfs_bio_stripe *new = 6144 bbio->stripes + index_where_to_add; 6145 struct btrfs_bio_stripe *old = 6146 bbio->stripes + i; 6147 6148 new->physical = old->physical; 6149 new->length = old->length; 6150 new->dev = dev_replace->tgtdev; 6151 bbio->tgtdev_map[i] = index_where_to_add; 6152 index_where_to_add++; 6153 max_errors++; 6154 tgtdev_indexes++; 6155 } 6156 } 6157 num_stripes = index_where_to_add; 6158 } else if (op == BTRFS_MAP_GET_READ_MIRRORS) { 6159 int index_srcdev = 0; 6160 int found = 0; 6161 u64 physical_of_found = 0; 6162 6163 /* 6164 * During the dev-replace procedure, the target drive can also 6165 * be used to read data in case it is needed to repair a corrupt 6166 * block elsewhere. This is possible if the requested area is 6167 * left of the left cursor. In this area, the target drive is a 6168 * full copy of the source drive. 
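 *
 * The loop below therefore picks the stripe that lives on the source
 * device (for DUP, the one with the lowest physical address) and, if it
 * finds one, appends a copy of that stripe pointing at the target
 * device, so the extra mirror becomes the last entry of the stripe
 * array.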
6169 */ 6170 for (i = 0; i < num_stripes; i++) { 6171 if (bbio->stripes[i].dev->devid == srcdev_devid) { 6172 /* 6173 * In case of DUP, in order to keep it simple, 6174 * only add the mirror with the lowest physical 6175 * address 6176 */ 6177 if (found && 6178 physical_of_found <= 6179 bbio->stripes[i].physical) 6180 continue; 6181 index_srcdev = i; 6182 found = 1; 6183 physical_of_found = bbio->stripes[i].physical; 6184 } 6185 } 6186 if (found) { 6187 struct btrfs_bio_stripe *tgtdev_stripe = 6188 bbio->stripes + num_stripes; 6189 6190 tgtdev_stripe->physical = physical_of_found; 6191 tgtdev_stripe->length = 6192 bbio->stripes[index_srcdev].length; 6193 tgtdev_stripe->dev = dev_replace->tgtdev; 6194 bbio->tgtdev_map[index_srcdev] = num_stripes; 6195 6196 tgtdev_indexes++; 6197 num_stripes++; 6198 } 6199 } 6200 6201 *num_stripes_ret = num_stripes; 6202 *max_errors_ret = max_errors; 6203 bbio->num_tgtdevs = tgtdev_indexes; 6204 *bbio_ret = bbio; 6205 } 6206 6207 static bool need_full_stripe(enum btrfs_map_op op) 6208 { 6209 return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); 6210 } 6211 6212 /* 6213 * Calculate the geometry of a particular (address, len) tuple. This 6214 * information is used to calculate how big a particular bio can get before it 6215 * straddles a stripe. 6216 * 6217 * @fs_info: the filesystem 6218 * @em: mapping containing the logical extent 6219 * @op: type of operation - write or read 6220 * @logical: address that we want to figure out the geometry of 6221 * @io_geom: pointer used to return values 6222 * 6223 * Returns < 0 in case a chunk for the given logical address cannot be found, 6224 * usually shouldn't happen unless @logical is corrupted, 0 otherwise. 6225 */ 6226 int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, 6227 enum btrfs_map_op op, u64 logical, 6228 struct btrfs_io_geometry *io_geom) 6229 { 6230 struct map_lookup *map; 6231 u64 len; 6232 u64 offset; 6233 u64 stripe_offset; 6234 u64 stripe_nr; 6235 u64 stripe_len; 6236 u64 raid56_full_stripe_start = (u64)-1; 6237 int data_stripes; 6238 6239 ASSERT(op != BTRFS_MAP_DISCARD); 6240 6241 map = em->map_lookup; 6242 /* Offset of this logical address in the chunk */ 6243 offset = logical - em->start; 6244 /* Len of a stripe in a chunk */ 6245 stripe_len = map->stripe_len; 6246 /* Stripe where this block falls in */ 6247 stripe_nr = div64_u64(offset, stripe_len); 6248 /* Offset of stripe in the chunk */ 6249 stripe_offset = stripe_nr * stripe_len; 6250 if (offset < stripe_offset) { 6251 btrfs_crit(fs_info, 6252 "stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu", 6253 stripe_offset, offset, em->start, logical, stripe_len); 6254 return -EINVAL; 6255 } 6256 6257 /* stripe_offset is the offset of this block in its stripe */ 6258 stripe_offset = offset - stripe_offset; 6259 data_stripes = nr_data_stripes(map); 6260 6261 if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 6262 u64 max_len = stripe_len - stripe_offset; 6263 6264 /* 6265 * In case of raid56, we need to know the stripe aligned start 6266 */ 6267 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 6268 unsigned long full_stripe_len = stripe_len * data_stripes; 6269 raid56_full_stripe_start = offset; 6270 6271 /* 6272 * Allow a write of a full stripe, but make sure we 6273 * don't allow straddling of stripes 6274 */ 6275 raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, 6276 full_stripe_len); 6277 raid56_full_stripe_start *= full_stripe_len; 6278 6279 /* 
6280 * For writes to RAID[56], allow a full stripeset across 6281 * all disks. For other RAID types and for RAID[56] 6282 * reads, just allow a single stripe (on a single disk). 6283 */ 6284 if (op == BTRFS_MAP_WRITE) { 6285 max_len = stripe_len * data_stripes - 6286 (offset - raid56_full_stripe_start); 6287 } 6288 } 6289 len = min_t(u64, em->len - offset, max_len); 6290 } else { 6291 len = em->len - offset; 6292 } 6293 6294 io_geom->len = len; 6295 io_geom->offset = offset; 6296 io_geom->stripe_len = stripe_len; 6297 io_geom->stripe_nr = stripe_nr; 6298 io_geom->stripe_offset = stripe_offset; 6299 io_geom->raid56_stripe_offset = raid56_full_stripe_start; 6300 6301 return 0; 6302 } 6303 6304 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, 6305 enum btrfs_map_op op, 6306 u64 logical, u64 *length, 6307 struct btrfs_bio **bbio_ret, 6308 int mirror_num, int need_raid_map) 6309 { 6310 struct extent_map *em; 6311 struct map_lookup *map; 6312 u64 stripe_offset; 6313 u64 stripe_nr; 6314 u64 stripe_len; 6315 u32 stripe_index; 6316 int data_stripes; 6317 int i; 6318 int ret = 0; 6319 int num_stripes; 6320 int max_errors = 0; 6321 int tgtdev_indexes = 0; 6322 struct btrfs_bio *bbio = NULL; 6323 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 6324 int dev_replace_is_ongoing = 0; 6325 int num_alloc_stripes; 6326 int patch_the_first_stripe_for_dev_replace = 0; 6327 u64 physical_to_patch_in_first_stripe = 0; 6328 u64 raid56_full_stripe_start = (u64)-1; 6329 struct btrfs_io_geometry geom; 6330 6331 ASSERT(bbio_ret); 6332 ASSERT(op != BTRFS_MAP_DISCARD); 6333 6334 em = btrfs_get_chunk_map(fs_info, logical, *length); 6335 ASSERT(!IS_ERR(em)); 6336 6337 ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom); 6338 if (ret < 0) 6339 return ret; 6340 6341 map = em->map_lookup; 6342 6343 *length = geom.len; 6344 stripe_len = geom.stripe_len; 6345 stripe_nr = geom.stripe_nr; 6346 stripe_offset = geom.stripe_offset; 6347 raid56_full_stripe_start = geom.raid56_stripe_offset; 6348 data_stripes = nr_data_stripes(map); 6349 6350 down_read(&dev_replace->rwsem); 6351 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 6352 /* 6353 * Hold the semaphore for read during the whole operation, write is 6354 * requested at commit time but must wait. 
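 *
 * Concretely: the rwsem is always taken for read here, dropped
 * immediately when no replace is running, and otherwise held until the
 * "out" label below so that the dev-replace state (for example the
 * target device) stays stable while the stripes are set up.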
6355 */ 6356 if (!dev_replace_is_ongoing) 6357 up_read(&dev_replace->rwsem); 6358 6359 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && 6360 !need_full_stripe(op) && dev_replace->tgtdev != NULL) { 6361 ret = get_extra_mirror_from_replace(fs_info, logical, *length, 6362 dev_replace->srcdev->devid, 6363 &mirror_num, 6364 &physical_to_patch_in_first_stripe); 6365 if (ret) 6366 goto out; 6367 else 6368 patch_the_first_stripe_for_dev_replace = 1; 6369 } else if (mirror_num > map->num_stripes) { 6370 mirror_num = 0; 6371 } 6372 6373 num_stripes = 1; 6374 stripe_index = 0; 6375 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 6376 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 6377 &stripe_index); 6378 if (!need_full_stripe(op)) 6379 mirror_num = 1; 6380 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { 6381 if (need_full_stripe(op)) 6382 num_stripes = map->num_stripes; 6383 else if (mirror_num) 6384 stripe_index = mirror_num - 1; 6385 else { 6386 stripe_index = find_live_mirror(fs_info, map, 0, 6387 dev_replace_is_ongoing); 6388 mirror_num = stripe_index + 1; 6389 } 6390 6391 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 6392 if (need_full_stripe(op)) { 6393 num_stripes = map->num_stripes; 6394 } else if (mirror_num) { 6395 stripe_index = mirror_num - 1; 6396 } else { 6397 mirror_num = 1; 6398 } 6399 6400 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 6401 u32 factor = map->num_stripes / map->sub_stripes; 6402 6403 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 6404 stripe_index *= map->sub_stripes; 6405 6406 if (need_full_stripe(op)) 6407 num_stripes = map->sub_stripes; 6408 else if (mirror_num) 6409 stripe_index += mirror_num - 1; 6410 else { 6411 int old_stripe_index = stripe_index; 6412 stripe_index = find_live_mirror(fs_info, map, 6413 stripe_index, 6414 dev_replace_is_ongoing); 6415 mirror_num = stripe_index - old_stripe_index + 1; 6416 } 6417 6418 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 6419 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { 6420 /* push stripe_nr back to the start of the full stripe */ 6421 stripe_nr = div64_u64(raid56_full_stripe_start, 6422 stripe_len * data_stripes); 6423 6424 /* RAID[56] write or recovery. Return all stripes */ 6425 num_stripes = map->num_stripes; 6426 max_errors = nr_parity_stripes(map); 6427 6428 *length = map->stripe_len; 6429 stripe_index = 0; 6430 stripe_offset = 0; 6431 } else { 6432 /* 6433 * Mirror #0 or #1 means the original data block. 6434 * Mirror #2 is RAID5 parity block. 6435 * Mirror #3 is RAID6 Q block. 
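 *
 * Worked example (hypothetical numbers): on RAID5 with num_stripes = 3,
 * data_stripes is 2. For stripe_nr = 5 on entry and mirror_num = 2, the
 * first div_u64_rem() below yields stripe_nr = 2 (the full-stripe
 * number) and stripe_index is then set to data_stripes + mirror_num - 2
 * = 2; the second div_u64_rem() rotates that to (2 + 2) % 3 = 1, i.e.
 * the parity of this full stripe sits on device index 1.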
6436 */ 6437 stripe_nr = div_u64_rem(stripe_nr, 6438 data_stripes, &stripe_index); 6439 if (mirror_num > 1) 6440 stripe_index = data_stripes + mirror_num - 2; 6441 6442 /* We distribute the parity blocks across stripes */ 6443 div_u64_rem(stripe_nr + stripe_index, map->num_stripes, 6444 &stripe_index); 6445 if (!need_full_stripe(op) && mirror_num <= 1) 6446 mirror_num = 1; 6447 } 6448 } else { 6449 /* 6450 * after this, stripe_nr is the number of stripes on this 6451 * device we have to walk to find the data, and stripe_index is 6452 * the number of our device in the stripe array 6453 */ 6454 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 6455 &stripe_index); 6456 mirror_num = stripe_index + 1; 6457 } 6458 if (stripe_index >= map->num_stripes) { 6459 btrfs_crit(fs_info, 6460 "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u", 6461 stripe_index, map->num_stripes); 6462 ret = -EINVAL; 6463 goto out; 6464 } 6465 6466 num_alloc_stripes = num_stripes; 6467 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) { 6468 if (op == BTRFS_MAP_WRITE) 6469 num_alloc_stripes <<= 1; 6470 if (op == BTRFS_MAP_GET_READ_MIRRORS) 6471 num_alloc_stripes++; 6472 tgtdev_indexes = num_stripes; 6473 } 6474 6475 bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes); 6476 if (!bbio) { 6477 ret = -ENOMEM; 6478 goto out; 6479 } 6480 6481 for (i = 0; i < num_stripes; i++) { 6482 bbio->stripes[i].physical = map->stripes[stripe_index].physical + 6483 stripe_offset + stripe_nr * map->stripe_len; 6484 bbio->stripes[i].dev = map->stripes[stripe_index].dev; 6485 stripe_index++; 6486 } 6487 6488 /* build raid_map */ 6489 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && 6490 (need_full_stripe(op) || mirror_num > 1)) { 6491 u64 tmp; 6492 unsigned rot; 6493 6494 /* Work out the disk rotation on this stripe-set */ 6495 div_u64_rem(stripe_nr, num_stripes, &rot); 6496 6497 /* Fill in the logical address of each stripe */ 6498 tmp = stripe_nr * data_stripes; 6499 for (i = 0; i < data_stripes; i++) 6500 bbio->raid_map[(i+rot) % num_stripes] = 6501 em->start + (tmp + i) * map->stripe_len; 6502 6503 bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; 6504 if (map->type & BTRFS_BLOCK_GROUP_RAID6) 6505 bbio->raid_map[(i+rot+1) % num_stripes] = 6506 RAID6_Q_STRIPE; 6507 6508 sort_parity_stripes(bbio, num_stripes); 6509 } 6510 6511 if (need_full_stripe(op)) 6512 max_errors = btrfs_chunk_max_errors(map); 6513 6514 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && 6515 need_full_stripe(op)) { 6516 handle_ops_on_dev_replace(op, &bbio, dev_replace, logical, 6517 &num_stripes, &max_errors); 6518 } 6519 6520 *bbio_ret = bbio; 6521 bbio->map_type = map->type; 6522 bbio->num_stripes = num_stripes; 6523 bbio->max_errors = max_errors; 6524 bbio->mirror_num = mirror_num; 6525 6526 /* 6527 * this is the case that REQ_READ && dev_replace_is_ongoing && 6528 * mirror_num == num_stripes + 1 && dev_replace target drive is 6529 * available as a mirror 6530 */ 6531 if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { 6532 WARN_ON(num_stripes > 1); 6533 bbio->stripes[0].dev = dev_replace->tgtdev; 6534 bbio->stripes[0].physical = physical_to_patch_in_first_stripe; 6535 bbio->mirror_num = map->num_stripes + 1; 6536 } 6537 out: 6538 if (dev_replace_is_ongoing) { 6539 lockdep_assert_held(&dev_replace->rwsem); 6540 /* Unlock and let waiting writers proceed */ 6541 up_read(&dev_replace->rwsem); 6542 } 6543 free_extent_map(em); 6544 return ret; 6545 } 6546 6547 int 
btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 6548 u64 logical, u64 *length, 6549 struct btrfs_bio **bbio_ret, int mirror_num) 6550 { 6551 if (op == BTRFS_MAP_DISCARD) 6552 return __btrfs_map_block_for_discard(fs_info, logical, 6553 length, bbio_ret); 6554 6555 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 6556 mirror_num, 0); 6557 } 6558 6559 /* For Scrub/replace */ 6560 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 6561 u64 logical, u64 *length, 6562 struct btrfs_bio **bbio_ret) 6563 { 6564 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1); 6565 } 6566 6567 static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio) 6568 { 6569 bio->bi_private = bbio->private; 6570 bio->bi_end_io = bbio->end_io; 6571 bio_endio(bio); 6572 6573 btrfs_put_bbio(bbio); 6574 } 6575 6576 static void btrfs_end_bio(struct bio *bio) 6577 { 6578 struct btrfs_bio *bbio = bio->bi_private; 6579 int is_orig_bio = 0; 6580 6581 if (bio->bi_status) { 6582 atomic_inc(&bbio->error); 6583 if (bio->bi_status == BLK_STS_IOERR || 6584 bio->bi_status == BLK_STS_TARGET) { 6585 struct btrfs_device *dev = btrfs_io_bio(bio)->device; 6586 6587 ASSERT(dev->bdev); 6588 if (btrfs_op(bio) == BTRFS_MAP_WRITE) 6589 btrfs_dev_stat_inc_and_print(dev, 6590 BTRFS_DEV_STAT_WRITE_ERRS); 6591 else if (!(bio->bi_opf & REQ_RAHEAD)) 6592 btrfs_dev_stat_inc_and_print(dev, 6593 BTRFS_DEV_STAT_READ_ERRS); 6594 if (bio->bi_opf & REQ_PREFLUSH) 6595 btrfs_dev_stat_inc_and_print(dev, 6596 BTRFS_DEV_STAT_FLUSH_ERRS); 6597 } 6598 } 6599 6600 if (bio == bbio->orig_bio) 6601 is_orig_bio = 1; 6602 6603 btrfs_bio_counter_dec(bbio->fs_info); 6604 6605 if (atomic_dec_and_test(&bbio->stripes_pending)) { 6606 if (!is_orig_bio) { 6607 bio_put(bio); 6608 bio = bbio->orig_bio; 6609 } 6610 6611 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 6612 /* only send an error to the higher layers if it is 6613 * beyond the tolerance of the btrfs bio 6614 */ 6615 if (atomic_read(&bbio->error) > bbio->max_errors) { 6616 bio->bi_status = BLK_STS_IOERR; 6617 } else { 6618 /* 6619 * this bio is actually up to date, we didn't 6620 * go over the max number of errors 6621 */ 6622 bio->bi_status = BLK_STS_OK; 6623 } 6624 6625 btrfs_end_bbio(bbio, bio); 6626 } else if (!is_orig_bio) { 6627 bio_put(bio); 6628 } 6629 } 6630 6631 static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, 6632 u64 physical, struct btrfs_device *dev) 6633 { 6634 struct btrfs_fs_info *fs_info = bbio->fs_info; 6635 6636 bio->bi_private = bbio; 6637 btrfs_io_bio(bio)->device = dev; 6638 bio->bi_end_io = btrfs_end_bio; 6639 bio->bi_iter.bi_sector = physical >> 9; 6640 /* 6641 * For zone append writing, bi_sector must point the beginning of the 6642 * zone 6643 */ 6644 if (bio_op(bio) == REQ_OP_ZONE_APPEND) { 6645 if (btrfs_dev_is_sequential(dev, physical)) { 6646 u64 zone_start = round_down(physical, fs_info->zone_size); 6647 6648 bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; 6649 } else { 6650 bio->bi_opf &= ~REQ_OP_ZONE_APPEND; 6651 bio->bi_opf |= REQ_OP_WRITE; 6652 } 6653 } 6654 btrfs_debug_in_rcu(fs_info, 6655 "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", 6656 bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, 6657 (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name), 6658 dev->devid, bio->bi_iter.bi_size); 6659 bio_set_dev(bio, dev->bdev); 6660 6661 btrfs_bio_counter_inc_noblocked(fs_info); 6662 6663 btrfsic_submit_bio(bio); 6664 } 6665 6666 static 
void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) 6667 { 6668 atomic_inc(&bbio->error); 6669 if (atomic_dec_and_test(&bbio->stripes_pending)) { 6670 /* Should be the original bio. */ 6671 WARN_ON(bio != bbio->orig_bio); 6672 6673 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 6674 bio->bi_iter.bi_sector = logical >> 9; 6675 if (atomic_read(&bbio->error) > bbio->max_errors) 6676 bio->bi_status = BLK_STS_IOERR; 6677 else 6678 bio->bi_status = BLK_STS_OK; 6679 btrfs_end_bbio(bbio, bio); 6680 } 6681 } 6682 6683 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, 6684 int mirror_num) 6685 { 6686 struct btrfs_device *dev; 6687 struct bio *first_bio = bio; 6688 u64 logical = bio->bi_iter.bi_sector << 9; 6689 u64 length = 0; 6690 u64 map_length; 6691 int ret; 6692 int dev_nr; 6693 int total_devs; 6694 struct btrfs_bio *bbio = NULL; 6695 6696 length = bio->bi_iter.bi_size; 6697 map_length = length; 6698 6699 btrfs_bio_counter_inc_blocked(fs_info); 6700 ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, 6701 &map_length, &bbio, mirror_num, 1); 6702 if (ret) { 6703 btrfs_bio_counter_dec(fs_info); 6704 return errno_to_blk_status(ret); 6705 } 6706 6707 total_devs = bbio->num_stripes; 6708 bbio->orig_bio = first_bio; 6709 bbio->private = first_bio->bi_private; 6710 bbio->end_io = first_bio->bi_end_io; 6711 bbio->fs_info = fs_info; 6712 atomic_set(&bbio->stripes_pending, bbio->num_stripes); 6713 6714 if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && 6715 ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) { 6716 /* In this case, map_length has been set to the length of 6717 a single stripe; not the whole write */ 6718 if (btrfs_op(bio) == BTRFS_MAP_WRITE) { 6719 ret = raid56_parity_write(fs_info, bio, bbio, 6720 map_length); 6721 } else { 6722 ret = raid56_parity_recover(fs_info, bio, bbio, 6723 map_length, mirror_num, 1); 6724 } 6725 6726 btrfs_bio_counter_dec(fs_info); 6727 return errno_to_blk_status(ret); 6728 } 6729 6730 if (map_length < length) { 6731 btrfs_crit(fs_info, 6732 "mapping failed logical %llu bio len %llu len %llu", 6733 logical, length, map_length); 6734 BUG(); 6735 } 6736 6737 for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { 6738 dev = bbio->stripes[dev_nr].dev; 6739 if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, 6740 &dev->dev_state) || 6741 (btrfs_op(first_bio) == BTRFS_MAP_WRITE && 6742 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { 6743 bbio_error(bbio, first_bio, logical); 6744 continue; 6745 } 6746 6747 if (dev_nr < total_devs - 1) 6748 bio = btrfs_bio_clone(first_bio); 6749 else 6750 bio = first_bio; 6751 6752 submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev); 6753 } 6754 btrfs_bio_counter_dec(fs_info); 6755 return BLK_STS_OK; 6756 } 6757 6758 /* 6759 * Find a device specified by @devid or @uuid in the list of @fs_devices, or 6760 * return NULL. 6761 * 6762 * If devid and uuid are both specified, the match must be exact, otherwise 6763 * only devid is used. 
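 *
 * The optional @fsid narrows the search further: when it is given, only
 * the fs_devices whose metadata_uuid matches it are considered, both
 * for the list passed in and for any seed filesystems on its seed_list.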
6764 */ 6765 struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, 6766 u64 devid, u8 *uuid, u8 *fsid) 6767 { 6768 struct btrfs_device *device; 6769 struct btrfs_fs_devices *seed_devs; 6770 6771 if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { 6772 list_for_each_entry(device, &fs_devices->devices, dev_list) { 6773 if (device->devid == devid && 6774 (!uuid || memcmp(device->uuid, uuid, 6775 BTRFS_UUID_SIZE) == 0)) 6776 return device; 6777 } 6778 } 6779 6780 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { 6781 if (!fsid || 6782 !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { 6783 list_for_each_entry(device, &seed_devs->devices, 6784 dev_list) { 6785 if (device->devid == devid && 6786 (!uuid || memcmp(device->uuid, uuid, 6787 BTRFS_UUID_SIZE) == 0)) 6788 return device; 6789 } 6790 } 6791 } 6792 6793 return NULL; 6794 } 6795 6796 static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, 6797 u64 devid, u8 *dev_uuid) 6798 { 6799 struct btrfs_device *device; 6800 unsigned int nofs_flag; 6801 6802 /* 6803 * We call this under the chunk_mutex, so we want to use NOFS for this 6804 * allocation, however we don't want to change btrfs_alloc_device() to 6805 * always do NOFS because we use it in a lot of other GFP_KERNEL safe 6806 * places. 6807 */ 6808 nofs_flag = memalloc_nofs_save(); 6809 device = btrfs_alloc_device(NULL, &devid, dev_uuid); 6810 memalloc_nofs_restore(nofs_flag); 6811 if (IS_ERR(device)) 6812 return device; 6813 6814 list_add(&device->dev_list, &fs_devices->devices); 6815 device->fs_devices = fs_devices; 6816 fs_devices->num_devices++; 6817 6818 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 6819 fs_devices->missing_devices++; 6820 6821 return device; 6822 } 6823 6824 /** 6825 * btrfs_alloc_device - allocate struct btrfs_device 6826 * @fs_info: used only for generating a new devid, can be NULL if 6827 * devid is provided (i.e. @devid != NULL). 6828 * @devid: a pointer to devid for this device. If NULL a new devid 6829 * is generated. 6830 * @uuid: a pointer to UUID for this device. If NULL a new UUID 6831 * is generated. 6832 * 6833 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() 6834 * on error. Returned struct is not linked onto any lists and must be 6835 * destroyed with btrfs_free_device. 
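 *
 * Minimal usage sketch (modelled on add_missing_dev() above, with the
 * surrounding logic elided):
 *
 *	dev = btrfs_alloc_device(NULL, &devid, dev_uuid);
 *	if (IS_ERR(dev))
 *		return PTR_ERR(dev);
 *	...
 *	btrfs_free_device(dev);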
6836 */ 6837 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, 6838 const u64 *devid, 6839 const u8 *uuid) 6840 { 6841 struct btrfs_device *dev; 6842 u64 tmp; 6843 6844 if (WARN_ON(!devid && !fs_info)) 6845 return ERR_PTR(-EINVAL); 6846 6847 dev = kzalloc(sizeof(*dev), GFP_KERNEL); 6848 if (!dev) 6849 return ERR_PTR(-ENOMEM); 6850 6851 /* 6852 * Preallocate a bio that's always going to be used for flushing device 6853 * barriers and matches the device lifespan 6854 */ 6855 dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0); 6856 if (!dev->flush_bio) { 6857 kfree(dev); 6858 return ERR_PTR(-ENOMEM); 6859 } 6860 6861 INIT_LIST_HEAD(&dev->dev_list); 6862 INIT_LIST_HEAD(&dev->dev_alloc_list); 6863 INIT_LIST_HEAD(&dev->post_commit_list); 6864 6865 atomic_set(&dev->reada_in_flight, 0); 6866 atomic_set(&dev->dev_stats_ccnt, 0); 6867 btrfs_device_data_ordered_init(dev); 6868 INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); 6869 INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); 6870 extent_io_tree_init(fs_info, &dev->alloc_state, 6871 IO_TREE_DEVICE_ALLOC_STATE, NULL); 6872 6873 if (devid) 6874 tmp = *devid; 6875 else { 6876 int ret; 6877 6878 ret = find_next_devid(fs_info, &tmp); 6879 if (ret) { 6880 btrfs_free_device(dev); 6881 return ERR_PTR(ret); 6882 } 6883 } 6884 dev->devid = tmp; 6885 6886 if (uuid) 6887 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE); 6888 else 6889 generate_random_uuid(dev->uuid); 6890 6891 return dev; 6892 } 6893 6894 static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, 6895 u64 devid, u8 *uuid, bool error) 6896 { 6897 if (error) 6898 btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing", 6899 devid, uuid); 6900 else 6901 btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", 6902 devid, uuid); 6903 } 6904 6905 static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes) 6906 { 6907 const int data_stripes = calc_data_stripes(type, num_stripes); 6908 6909 return div_u64(chunk_len, data_stripes); 6910 } 6911 6912 #if BITS_PER_LONG == 32 6913 /* 6914 * Due to page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE 6915 * can't be accessed on 32bit systems. 6916 * 6917 * This function do mount time check to reject the fs if it already has 6918 * metadata chunk beyond that limit. 6919 */ 6920 static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info, 6921 u64 logical, u64 length, u64 type) 6922 { 6923 if (!(type & BTRFS_BLOCK_GROUP_METADATA)) 6924 return 0; 6925 6926 if (logical + length < MAX_LFS_FILESIZE) 6927 return 0; 6928 6929 btrfs_err_32bit_limit(fs_info); 6930 return -EOVERFLOW; 6931 } 6932 6933 /* 6934 * This is to give early warning for any metadata chunk reaching 6935 * BTRFS_32BIT_EARLY_WARN_THRESHOLD. 6936 * Although we can still access the metadata, it's not going to be possible 6937 * once the limit is reached. 
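 *
 * Unlike check_32bit_meta_chunk() above, which fails the mount once a
 * metadata chunk crosses the hard limit, this helper only prints a
 * warning when a chunk reaches the lower early-warning threshold.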
6938 */ 6939 static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info, 6940 u64 logical, u64 length, u64 type) 6941 { 6942 if (!(type & BTRFS_BLOCK_GROUP_METADATA)) 6943 return; 6944 6945 if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD) 6946 return; 6947 6948 btrfs_warn_32bit_limit(fs_info); 6949 } 6950 #endif 6951 6952 static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, 6953 struct btrfs_chunk *chunk) 6954 { 6955 struct btrfs_fs_info *fs_info = leaf->fs_info; 6956 struct extent_map_tree *map_tree = &fs_info->mapping_tree; 6957 struct map_lookup *map; 6958 struct extent_map *em; 6959 u64 logical; 6960 u64 length; 6961 u64 devid; 6962 u64 type; 6963 u8 uuid[BTRFS_UUID_SIZE]; 6964 int num_stripes; 6965 int ret; 6966 int i; 6967 6968 logical = key->offset; 6969 length = btrfs_chunk_length(leaf, chunk); 6970 type = btrfs_chunk_type(leaf, chunk); 6971 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 6972 6973 #if BITS_PER_LONG == 32 6974 ret = check_32bit_meta_chunk(fs_info, logical, length, type); 6975 if (ret < 0) 6976 return ret; 6977 warn_32bit_meta_chunk(fs_info, logical, length, type); 6978 #endif 6979 6980 /* 6981 * Only need to verify chunk item if we're reading from sys chunk array, 6982 * as chunk item in tree block is already verified by tree-checker. 6983 */ 6984 if (leaf->start == BTRFS_SUPER_INFO_OFFSET) { 6985 ret = btrfs_check_chunk_valid(leaf, chunk, logical); 6986 if (ret) 6987 return ret; 6988 } 6989 6990 read_lock(&map_tree->lock); 6991 em = lookup_extent_mapping(map_tree, logical, 1); 6992 read_unlock(&map_tree->lock); 6993 6994 /* already mapped? */ 6995 if (em && em->start <= logical && em->start + em->len > logical) { 6996 free_extent_map(em); 6997 return 0; 6998 } else if (em) { 6999 free_extent_map(em); 7000 } 7001 7002 em = alloc_extent_map(); 7003 if (!em) 7004 return -ENOMEM; 7005 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 7006 if (!map) { 7007 free_extent_map(em); 7008 return -ENOMEM; 7009 } 7010 7011 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 7012 em->map_lookup = map; 7013 em->start = logical; 7014 em->len = length; 7015 em->orig_start = 0; 7016 em->block_start = 0; 7017 em->block_len = em->len; 7018 7019 map->num_stripes = num_stripes; 7020 map->io_width = btrfs_chunk_io_width(leaf, chunk); 7021 map->io_align = btrfs_chunk_io_align(leaf, chunk); 7022 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 7023 map->type = type; 7024 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 7025 map->verified_stripes = 0; 7026 em->orig_block_len = calc_stripe_length(type, em->len, 7027 map->num_stripes); 7028 for (i = 0; i < num_stripes; i++) { 7029 map->stripes[i].physical = 7030 btrfs_stripe_offset_nr(leaf, chunk, i); 7031 devid = btrfs_stripe_devid_nr(leaf, chunk, i); 7032 read_extent_buffer(leaf, uuid, (unsigned long) 7033 btrfs_stripe_dev_uuid_nr(chunk, i), 7034 BTRFS_UUID_SIZE); 7035 map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, 7036 devid, uuid, NULL); 7037 if (!map->stripes[i].dev && 7038 !btrfs_test_opt(fs_info, DEGRADED)) { 7039 free_extent_map(em); 7040 btrfs_report_missing_device(fs_info, devid, uuid, true); 7041 return -ENOENT; 7042 } 7043 if (!map->stripes[i].dev) { 7044 map->stripes[i].dev = 7045 add_missing_dev(fs_info->fs_devices, devid, 7046 uuid); 7047 if (IS_ERR(map->stripes[i].dev)) { 7048 free_extent_map(em); 7049 btrfs_err(fs_info, 7050 "failed to init missing dev %llu: %ld", 7051 devid, PTR_ERR(map->stripes[i].dev)); 7052 return PTR_ERR(map->stripes[i].dev); 7053 } 7054 
btrfs_report_missing_device(fs_info, devid, uuid, false); 7055 } 7056 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 7057 &(map->stripes[i].dev->dev_state)); 7058 7059 } 7060 7061 write_lock(&map_tree->lock); 7062 ret = add_extent_mapping(map_tree, em, 0); 7063 write_unlock(&map_tree->lock); 7064 if (ret < 0) { 7065 btrfs_err(fs_info, 7066 "failed to add chunk map, start=%llu len=%llu: %d", 7067 em->start, em->len, ret); 7068 } 7069 free_extent_map(em); 7070 7071 return ret; 7072 } 7073 7074 static void fill_device_from_item(struct extent_buffer *leaf, 7075 struct btrfs_dev_item *dev_item, 7076 struct btrfs_device *device) 7077 { 7078 unsigned long ptr; 7079 7080 device->devid = btrfs_device_id(leaf, dev_item); 7081 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 7082 device->total_bytes = device->disk_total_bytes; 7083 device->commit_total_bytes = device->disk_total_bytes; 7084 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 7085 device->commit_bytes_used = device->bytes_used; 7086 device->type = btrfs_device_type(leaf, dev_item); 7087 device->io_align = btrfs_device_io_align(leaf, dev_item); 7088 device->io_width = btrfs_device_io_width(leaf, dev_item); 7089 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 7090 WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); 7091 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 7092 7093 ptr = btrfs_device_uuid(dev_item); 7094 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 7095 } 7096 7097 static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, 7098 u8 *fsid) 7099 { 7100 struct btrfs_fs_devices *fs_devices; 7101 int ret; 7102 7103 lockdep_assert_held(&uuid_mutex); 7104 ASSERT(fsid); 7105 7106 /* This will match only for multi-device seed fs */ 7107 list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list) 7108 if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) 7109 return fs_devices; 7110 7111 7112 fs_devices = find_fsid(fsid, NULL); 7113 if (!fs_devices) { 7114 if (!btrfs_test_opt(fs_info, DEGRADED)) 7115 return ERR_PTR(-ENOENT); 7116 7117 fs_devices = alloc_fs_devices(fsid, NULL); 7118 if (IS_ERR(fs_devices)) 7119 return fs_devices; 7120 7121 fs_devices->seeding = true; 7122 fs_devices->opened = 1; 7123 return fs_devices; 7124 } 7125 7126 /* 7127 * Upon first call for a seed fs fsid, just create a private copy of the 7128 * respective fs_devices and anchor it at fs_info->fs_devices->seed_list 7129 */ 7130 fs_devices = clone_fs_devices(fs_devices); 7131 if (IS_ERR(fs_devices)) 7132 return fs_devices; 7133 7134 ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder); 7135 if (ret) { 7136 free_fs_devices(fs_devices); 7137 return ERR_PTR(ret); 7138 } 7139 7140 if (!fs_devices->seeding) { 7141 close_fs_devices(fs_devices); 7142 free_fs_devices(fs_devices); 7143 return ERR_PTR(-EINVAL); 7144 } 7145 7146 list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list); 7147 7148 return fs_devices; 7149 } 7150 7151 static int read_one_dev(struct extent_buffer *leaf, 7152 struct btrfs_dev_item *dev_item) 7153 { 7154 struct btrfs_fs_info *fs_info = leaf->fs_info; 7155 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7156 struct btrfs_device *device; 7157 u64 devid; 7158 int ret; 7159 u8 fs_uuid[BTRFS_FSID_SIZE]; 7160 u8 dev_uuid[BTRFS_UUID_SIZE]; 7161 7162 devid = btrfs_device_id(leaf, dev_item); 7163 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 7164 BTRFS_UUID_SIZE); 7165 read_extent_buffer(leaf, 
fs_uuid, btrfs_device_fsid(dev_item), 7166 BTRFS_FSID_SIZE); 7167 7168 if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) { 7169 fs_devices = open_seed_devices(fs_info, fs_uuid); 7170 if (IS_ERR(fs_devices)) 7171 return PTR_ERR(fs_devices); 7172 } 7173 7174 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, 7175 fs_uuid); 7176 if (!device) { 7177 if (!btrfs_test_opt(fs_info, DEGRADED)) { 7178 btrfs_report_missing_device(fs_info, devid, 7179 dev_uuid, true); 7180 return -ENOENT; 7181 } 7182 7183 device = add_missing_dev(fs_devices, devid, dev_uuid); 7184 if (IS_ERR(device)) { 7185 btrfs_err(fs_info, 7186 "failed to add missing dev %llu: %ld", 7187 devid, PTR_ERR(device)); 7188 return PTR_ERR(device); 7189 } 7190 btrfs_report_missing_device(fs_info, devid, dev_uuid, false); 7191 } else { 7192 if (!device->bdev) { 7193 if (!btrfs_test_opt(fs_info, DEGRADED)) { 7194 btrfs_report_missing_device(fs_info, 7195 devid, dev_uuid, true); 7196 return -ENOENT; 7197 } 7198 btrfs_report_missing_device(fs_info, devid, 7199 dev_uuid, false); 7200 } 7201 7202 if (!device->bdev && 7203 !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { 7204 /* 7205 * this happens when a device that was properly setup 7206 * in the device info lists suddenly goes bad. 7207 * device->bdev is NULL, and so we have to set 7208 * device->missing to one here 7209 */ 7210 device->fs_devices->missing_devices++; 7211 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 7212 } 7213 7214 /* Move the device to its own fs_devices */ 7215 if (device->fs_devices != fs_devices) { 7216 ASSERT(test_bit(BTRFS_DEV_STATE_MISSING, 7217 &device->dev_state)); 7218 7219 list_move(&device->dev_list, &fs_devices->devices); 7220 device->fs_devices->num_devices--; 7221 fs_devices->num_devices++; 7222 7223 device->fs_devices->missing_devices--; 7224 fs_devices->missing_devices++; 7225 7226 device->fs_devices = fs_devices; 7227 } 7228 } 7229 7230 if (device->fs_devices != fs_info->fs_devices) { 7231 BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)); 7232 if (device->generation != 7233 btrfs_device_generation(leaf, dev_item)) 7234 return -EINVAL; 7235 } 7236 7237 fill_device_from_item(leaf, dev_item, device); 7238 if (device->bdev) { 7239 u64 max_total_bytes = i_size_read(device->bdev->bd_inode); 7240 7241 if (device->total_bytes > max_total_bytes) { 7242 btrfs_err(fs_info, 7243 "device total_bytes should be at most %llu but found %llu", 7244 max_total_bytes, device->total_bytes); 7245 return -EINVAL; 7246 } 7247 } 7248 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 7249 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 7250 !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 7251 device->fs_devices->total_rw_bytes += device->total_bytes; 7252 atomic64_add(device->total_bytes - device->bytes_used, 7253 &fs_info->free_chunk_space); 7254 } 7255 ret = 0; 7256 return ret; 7257 } 7258 7259 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) 7260 { 7261 struct btrfs_root *root = fs_info->tree_root; 7262 struct btrfs_super_block *super_copy = fs_info->super_copy; 7263 struct extent_buffer *sb; 7264 struct btrfs_disk_key *disk_key; 7265 struct btrfs_chunk *chunk; 7266 u8 *array_ptr; 7267 unsigned long sb_array_offset; 7268 int ret = 0; 7269 u32 num_stripes; 7270 u32 array_size; 7271 u32 len = 0; 7272 u32 cur_offset; 7273 u64 type; 7274 struct btrfs_key key; 7275 7276 ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize); 7277 /* 7278 * This will create extent buffer of nodesize, 
superblock size is 7279 * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will 7280 * overallocate but we can keep it as-is, only the first page is used. 7281 */ 7282 sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET, 7283 root->root_key.objectid, 0); 7284 if (IS_ERR(sb)) 7285 return PTR_ERR(sb); 7286 set_extent_buffer_uptodate(sb); 7287 /* 7288 * The sb extent buffer is artificial and just used to read the system array. 7289 * set_extent_buffer_uptodate() call does not properly mark all it's 7290 * pages up-to-date when the page is larger: extent does not cover the 7291 * whole page and consequently check_page_uptodate does not find all 7292 * the page's extents up-to-date (the hole beyond sb), 7293 * write_extent_buffer then triggers a WARN_ON. 7294 * 7295 * Regular short extents go through mark_extent_buffer_dirty/writeback cycle, 7296 * but sb spans only this function. Add an explicit SetPageUptodate call 7297 * to silence the warning eg. on PowerPC 64. 7298 */ 7299 if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE) 7300 SetPageUptodate(sb->pages[0]); 7301 7302 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 7303 array_size = btrfs_super_sys_array_size(super_copy); 7304 7305 array_ptr = super_copy->sys_chunk_array; 7306 sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array); 7307 cur_offset = 0; 7308 7309 while (cur_offset < array_size) { 7310 disk_key = (struct btrfs_disk_key *)array_ptr; 7311 len = sizeof(*disk_key); 7312 if (cur_offset + len > array_size) 7313 goto out_short_read; 7314 7315 btrfs_disk_key_to_cpu(&key, disk_key); 7316 7317 array_ptr += len; 7318 sb_array_offset += len; 7319 cur_offset += len; 7320 7321 if (key.type != BTRFS_CHUNK_ITEM_KEY) { 7322 btrfs_err(fs_info, 7323 "unexpected item type %u in sys_array at offset %u", 7324 (u32)key.type, cur_offset); 7325 ret = -EIO; 7326 break; 7327 } 7328 7329 chunk = (struct btrfs_chunk *)sb_array_offset; 7330 /* 7331 * At least one btrfs_chunk with one stripe must be present, 7332 * exact stripe count check comes afterwards 7333 */ 7334 len = btrfs_chunk_item_size(1); 7335 if (cur_offset + len > array_size) 7336 goto out_short_read; 7337 7338 num_stripes = btrfs_chunk_num_stripes(sb, chunk); 7339 if (!num_stripes) { 7340 btrfs_err(fs_info, 7341 "invalid number of stripes %u in sys_array at offset %u", 7342 num_stripes, cur_offset); 7343 ret = -EIO; 7344 break; 7345 } 7346 7347 type = btrfs_chunk_type(sb, chunk); 7348 if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { 7349 btrfs_err(fs_info, 7350 "invalid chunk type %llu in sys_array at offset %u", 7351 type, cur_offset); 7352 ret = -EIO; 7353 break; 7354 } 7355 7356 len = btrfs_chunk_item_size(num_stripes); 7357 if (cur_offset + len > array_size) 7358 goto out_short_read; 7359 7360 ret = read_one_chunk(&key, sb, chunk); 7361 if (ret) 7362 break; 7363 7364 array_ptr += len; 7365 sb_array_offset += len; 7366 cur_offset += len; 7367 } 7368 clear_extent_buffer_uptodate(sb); 7369 free_extent_buffer_stale(sb); 7370 return ret; 7371 7372 out_short_read: 7373 btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u", 7374 len, cur_offset); 7375 clear_extent_buffer_uptodate(sb); 7376 free_extent_buffer_stale(sb); 7377 return -EIO; 7378 } 7379 7380 /* 7381 * Check if all chunks in the fs are OK for read-write degraded mount 7382 * 7383 * If the @failing_dev is specified, it's accounted as missing. 7384 * 7385 * Return true if all chunks meet the minimal RW mount requirements. 
7386 * Return false if any chunk doesn't meet the minimal RW mount requirements. 7387 */ 7388 bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, 7389 struct btrfs_device *failing_dev) 7390 { 7391 struct extent_map_tree *map_tree = &fs_info->mapping_tree; 7392 struct extent_map *em; 7393 u64 next_start = 0; 7394 bool ret = true; 7395 7396 read_lock(&map_tree->lock); 7397 em = lookup_extent_mapping(map_tree, 0, (u64)-1); 7398 read_unlock(&map_tree->lock); 7399 /* No chunk at all? Return false anyway */ 7400 if (!em) { 7401 ret = false; 7402 goto out; 7403 } 7404 while (em) { 7405 struct map_lookup *map; 7406 int missing = 0; 7407 int max_tolerated; 7408 int i; 7409 7410 map = em->map_lookup; 7411 max_tolerated = 7412 btrfs_get_num_tolerated_disk_barrier_failures( 7413 map->type); 7414 for (i = 0; i < map->num_stripes; i++) { 7415 struct btrfs_device *dev = map->stripes[i].dev; 7416 7417 if (!dev || !dev->bdev || 7418 test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || 7419 dev->last_flush_error) 7420 missing++; 7421 else if (failing_dev && failing_dev == dev) 7422 missing++; 7423 } 7424 if (missing > max_tolerated) { 7425 if (!failing_dev) 7426 btrfs_warn(fs_info, 7427 "chunk %llu missing %d devices, max tolerance is %d for writable mount", 7428 em->start, missing, max_tolerated); 7429 free_extent_map(em); 7430 ret = false; 7431 goto out; 7432 } 7433 next_start = extent_map_end(em); 7434 free_extent_map(em); 7435 7436 read_lock(&map_tree->lock); 7437 em = lookup_extent_mapping(map_tree, next_start, 7438 (u64)(-1) - next_start); 7439 read_unlock(&map_tree->lock); 7440 } 7441 out: 7442 return ret; 7443 } 7444 7445 static void readahead_tree_node_children(struct extent_buffer *node) 7446 { 7447 int i; 7448 const int nr_items = btrfs_header_nritems(node); 7449 7450 for (i = 0; i < nr_items; i++) 7451 btrfs_readahead_node_child(node, i); 7452 } 7453 7454 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) 7455 { 7456 struct btrfs_root *root = fs_info->chunk_root; 7457 struct btrfs_path *path; 7458 struct extent_buffer *leaf; 7459 struct btrfs_key key; 7460 struct btrfs_key found_key; 7461 int ret; 7462 int slot; 7463 u64 total_dev = 0; 7464 u64 last_ra_node = 0; 7465 7466 path = btrfs_alloc_path(); 7467 if (!path) 7468 return -ENOMEM; 7469 7470 /* 7471 * uuid_mutex is needed only if we are mounting a sprout FS 7472 * otherwise we don't need it. 7473 */ 7474 mutex_lock(&uuid_mutex); 7475 7476 /* 7477 * It is possible for mount and umount to race in such a way that 7478 * we execute this code path, but open_fs_devices failed to clear 7479 * total_rw_bytes. We certainly want it cleared before reading the 7480 * device items, so clear it here. 7481 */ 7482 fs_info->fs_devices->total_rw_bytes = 0; 7483 7484 /* 7485 * Read all device items, and then all the chunk items. All 7486 * device items are found before any chunk item (their object id 7487 * is smaller than the lowest possible object id for a chunk 7488 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID). 
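 *
 * That ordering is what lets the single search below, starting at key
 * (BTRFS_DEV_ITEMS_OBJECTID, 0, 0), walk all device items and then all
 * chunk items in one pass over the tree.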
7489 */ 7490 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 7491 key.offset = 0; 7492 key.type = 0; 7493 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 7494 if (ret < 0) 7495 goto error; 7496 while (1) { 7497 struct extent_buffer *node; 7498 7499 leaf = path->nodes[0]; 7500 slot = path->slots[0]; 7501 if (slot >= btrfs_header_nritems(leaf)) { 7502 ret = btrfs_next_leaf(root, path); 7503 if (ret == 0) 7504 continue; 7505 if (ret < 0) 7506 goto error; 7507 break; 7508 } 7509 /* 7510 * The nodes on level 1 are not locked but we don't need to do 7511 * that during mount time as nothing else can access the tree 7512 */ 7513 node = path->nodes[1]; 7514 if (node) { 7515 if (last_ra_node != node->start) { 7516 readahead_tree_node_children(node); 7517 last_ra_node = node->start; 7518 } 7519 } 7520 btrfs_item_key_to_cpu(leaf, &found_key, slot); 7521 if (found_key.type == BTRFS_DEV_ITEM_KEY) { 7522 struct btrfs_dev_item *dev_item; 7523 dev_item = btrfs_item_ptr(leaf, slot, 7524 struct btrfs_dev_item); 7525 ret = read_one_dev(leaf, dev_item); 7526 if (ret) 7527 goto error; 7528 total_dev++; 7529 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { 7530 struct btrfs_chunk *chunk; 7531 7532 /* 7533 * We are only called at mount time, so no need to take 7534 * fs_info->chunk_mutex. Plus, to avoid lockdep warnings, 7535 * we always lock first fs_info->chunk_mutex before 7536 * acquiring any locks on the chunk tree. This is a 7537 * requirement for chunk allocation, see the comment on 7538 * top of btrfs_chunk_alloc() for details. 7539 */ 7540 ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags)); 7541 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 7542 ret = read_one_chunk(&found_key, leaf, chunk); 7543 if (ret) 7544 goto error; 7545 } 7546 path->slots[0]++; 7547 } 7548 7549 /* 7550 * After loading chunk tree, we've got all device information, 7551 * do another round of validation checks. 
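 *
 * Two checks follow: the number of device items found must match
 * fs_devices->total_devices, and btrfs_super_total_bytes() must be at
 * least fs_devices->total_rw_bytes.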
7552 */ 7553 if (total_dev != fs_info->fs_devices->total_devices) { 7554 btrfs_err(fs_info, 7555 "super_num_devices %llu mismatch with num_devices %llu found here", 7556 btrfs_super_num_devices(fs_info->super_copy), 7557 total_dev); 7558 ret = -EINVAL; 7559 goto error; 7560 } 7561 if (btrfs_super_total_bytes(fs_info->super_copy) < 7562 fs_info->fs_devices->total_rw_bytes) { 7563 btrfs_err(fs_info, 7564 "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu", 7565 btrfs_super_total_bytes(fs_info->super_copy), 7566 fs_info->fs_devices->total_rw_bytes); 7567 ret = -EINVAL; 7568 goto error; 7569 } 7570 ret = 0; 7571 error: 7572 mutex_unlock(&uuid_mutex); 7573 7574 btrfs_free_path(path); 7575 return ret; 7576 } 7577 7578 void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) 7579 { 7580 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; 7581 struct btrfs_device *device; 7582 7583 fs_devices->fs_info = fs_info; 7584 7585 mutex_lock(&fs_devices->device_list_mutex); 7586 list_for_each_entry(device, &fs_devices->devices, dev_list) 7587 device->fs_info = fs_info; 7588 7589 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { 7590 list_for_each_entry(device, &seed_devs->devices, dev_list) 7591 device->fs_info = fs_info; 7592 7593 seed_devs->fs_info = fs_info; 7594 } 7595 mutex_unlock(&fs_devices->device_list_mutex); 7596 } 7597 7598 static u64 btrfs_dev_stats_value(const struct extent_buffer *eb, 7599 const struct btrfs_dev_stats_item *ptr, 7600 int index) 7601 { 7602 u64 val; 7603 7604 read_extent_buffer(eb, &val, 7605 offsetof(struct btrfs_dev_stats_item, values) + 7606 ((unsigned long)ptr) + (index * sizeof(u64)), 7607 sizeof(val)); 7608 return val; 7609 } 7610 7611 static void btrfs_set_dev_stats_value(struct extent_buffer *eb, 7612 struct btrfs_dev_stats_item *ptr, 7613 int index, u64 val) 7614 { 7615 write_extent_buffer(eb, &val, 7616 offsetof(struct btrfs_dev_stats_item, values) + 7617 ((unsigned long)ptr) + (index * sizeof(u64)), 7618 sizeof(val)); 7619 } 7620 7621 static int btrfs_device_init_dev_stats(struct btrfs_device *device, 7622 struct btrfs_path *path) 7623 { 7624 struct btrfs_dev_stats_item *ptr; 7625 struct extent_buffer *eb; 7626 struct btrfs_key key; 7627 int item_size; 7628 int i, ret, slot; 7629 7630 if (!device->fs_info->dev_root) 7631 return 0; 7632 7633 key.objectid = BTRFS_DEV_STATS_OBJECTID; 7634 key.type = BTRFS_PERSISTENT_ITEM_KEY; 7635 key.offset = device->devid; 7636 ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0); 7637 if (ret) { 7638 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7639 btrfs_dev_stat_set(device, i, 0); 7640 device->dev_stats_valid = 1; 7641 btrfs_release_path(path); 7642 return ret < 0 ? 
ret : 0; 7643 } 7644 slot = path->slots[0]; 7645 eb = path->nodes[0]; 7646 item_size = btrfs_item_size_nr(eb, slot); 7647 7648 ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item); 7649 7650 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 7651 if (item_size >= (1 + i) * sizeof(__le64)) 7652 btrfs_dev_stat_set(device, i, 7653 btrfs_dev_stats_value(eb, ptr, i)); 7654 else 7655 btrfs_dev_stat_set(device, i, 0); 7656 } 7657 7658 device->dev_stats_valid = 1; 7659 btrfs_dev_stat_print_on_load(device); 7660 btrfs_release_path(path); 7661 7662 return 0; 7663 } 7664 7665 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) 7666 { 7667 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; 7668 struct btrfs_device *device; 7669 struct btrfs_path *path = NULL; 7670 int ret = 0; 7671 7672 path = btrfs_alloc_path(); 7673 if (!path) 7674 return -ENOMEM; 7675 7676 mutex_lock(&fs_devices->device_list_mutex); 7677 list_for_each_entry(device, &fs_devices->devices, dev_list) { 7678 ret = btrfs_device_init_dev_stats(device, path); 7679 if (ret) 7680 goto out; 7681 } 7682 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { 7683 list_for_each_entry(device, &seed_devs->devices, dev_list) { 7684 ret = btrfs_device_init_dev_stats(device, path); 7685 if (ret) 7686 goto out; 7687 } 7688 } 7689 out: 7690 mutex_unlock(&fs_devices->device_list_mutex); 7691 7692 btrfs_free_path(path); 7693 return ret; 7694 } 7695 7696 static int update_dev_stat_item(struct btrfs_trans_handle *trans, 7697 struct btrfs_device *device) 7698 { 7699 struct btrfs_fs_info *fs_info = trans->fs_info; 7700 struct btrfs_root *dev_root = fs_info->dev_root; 7701 struct btrfs_path *path; 7702 struct btrfs_key key; 7703 struct extent_buffer *eb; 7704 struct btrfs_dev_stats_item *ptr; 7705 int ret; 7706 int i; 7707 7708 key.objectid = BTRFS_DEV_STATS_OBJECTID; 7709 key.type = BTRFS_PERSISTENT_ITEM_KEY; 7710 key.offset = device->devid; 7711 7712 path = btrfs_alloc_path(); 7713 if (!path) 7714 return -ENOMEM; 7715 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); 7716 if (ret < 0) { 7717 btrfs_warn_in_rcu(fs_info, 7718 "error %d while searching for dev_stats item for device %s", 7719 ret, rcu_str_deref(device->name)); 7720 goto out; 7721 } 7722 7723 if (ret == 0 && 7724 btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { 7725 /* need to delete old one and insert a new one */ 7726 ret = btrfs_del_item(trans, dev_root, path); 7727 if (ret != 0) { 7728 btrfs_warn_in_rcu(fs_info, 7729 "delete too small dev_stats item for device %s failed %d", 7730 rcu_str_deref(device->name), ret); 7731 goto out; 7732 } 7733 ret = 1; 7734 } 7735 7736 if (ret == 1) { 7737 /* need to insert a new item */ 7738 btrfs_release_path(path); 7739 ret = btrfs_insert_empty_item(trans, dev_root, path, 7740 &key, sizeof(*ptr)); 7741 if (ret < 0) { 7742 btrfs_warn_in_rcu(fs_info, 7743 "insert dev_stats item for device %s failed %d", 7744 rcu_str_deref(device->name), ret); 7745 goto out; 7746 } 7747 } 7748 7749 eb = path->nodes[0]; 7750 ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item); 7751 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7752 btrfs_set_dev_stats_value(eb, ptr, i, 7753 btrfs_dev_stat_read(device, i)); 7754 btrfs_mark_buffer_dirty(eb); 7755 7756 out: 7757 btrfs_free_path(path); 7758 return ret; 7759 } 7760 7761 /* 7762 * called from commit_transaction. Writes all changed device stats to disk. 
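 *
 * Only devices with valid stats and a non-zero change counter
 * (dev_stats_ccnt) are written out; on success the counter is reduced
 * by the snapshot taken before the update, so increments that race with
 * the write are kept for the next commit.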
static int update_dev_stat_item(struct btrfs_trans_handle *trans,
				struct btrfs_device *device)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *eb;
	struct btrfs_dev_stats_item *ptr;
	int ret;
	int i;

	key.objectid = BTRFS_DEV_STATS_OBJECTID;
	key.type = BTRFS_PERSISTENT_ITEM_KEY;
	key.offset = device->devid;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
	if (ret < 0) {
		btrfs_warn_in_rcu(fs_info,
			"error %d while searching for dev_stats item for device %s",
			ret, rcu_str_deref(device->name));
		goto out;
	}

	if (ret == 0 &&
	    btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
		/* need to delete old one and insert a new one */
		ret = btrfs_del_item(trans, dev_root, path);
		if (ret != 0) {
			btrfs_warn_in_rcu(fs_info,
				"delete too small dev_stats item for device %s failed %d",
				rcu_str_deref(device->name), ret);
			goto out;
		}
		ret = 1;
	}

	if (ret == 1) {
		/* need to insert a new item */
		btrfs_release_path(path);
		ret = btrfs_insert_empty_item(trans, dev_root, path,
					      &key, sizeof(*ptr));
		if (ret < 0) {
			btrfs_warn_in_rcu(fs_info,
				"insert dev_stats item for device %s failed %d",
				rcu_str_deref(device->name), ret);
			goto out;
		}
	}

	eb = path->nodes[0];
	ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		btrfs_set_dev_stats_value(eb, ptr, i,
					  btrfs_dev_stat_read(device, i));
	btrfs_mark_buffer_dirty(eb);

out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Called from commit_transaction. Writes all changed device stats to disk.
 */
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	int stats_cnt;
	int ret = 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		stats_cnt = atomic_read(&device->dev_stats_ccnt);
		if (!device->dev_stats_valid || stats_cnt == 0)
			continue;

		/*
		 * There is a LOAD-LOAD control dependency between the value of
		 * dev_stats_ccnt and updating the on-disk values which requires
		 * reading the in-memory counters. Such control dependencies
		 * require explicit read memory barriers.
		 *
		 * This memory barrier pairs with smp_mb__before_atomic in
		 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
		 * barrier implied by atomic_xchg in
		 * btrfs_dev_stats_read_and_reset.
		 */
		smp_rmb();

		ret = update_dev_stat_item(trans, device);
		if (!ret)
			atomic_sub(stats_cnt, &device->dev_stats_ccnt);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}

void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
	btrfs_dev_stat_inc(dev, index);
	btrfs_dev_stat_print_on_error(dev);
}

static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
{
	if (!dev->dev_stats_valid)
		return;
	btrfs_err_rl_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
			   rcu_str_deref(dev->name),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
{
	int i;

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		if (btrfs_dev_stat_read(dev, i) != 0)
			break;
	if (i == BTRFS_DEV_STAT_VALUES_MAX)
		return; /* all values == 0, suppress message */

	btrfs_info_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
		   rcu_str_deref(dev->name),
		   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
		   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
		   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
		   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
		   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}
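
/*
 * Fill a btrfs_ioctl_get_dev_stats structure for user space.
 *
 * The device is looked up by stats->devid; -ENODEV is returned if it does
 * not exist or if its statistics have not been loaded yet. With
 * BTRFS_DEV_STATS_RESET set in stats->flags the counters are returned and
 * reset (via an atomic exchange), otherwise they are only read.
 * stats->nr_items is clamped to BTRFS_DEV_STAT_VALUES_MAX so the value
 * passed back reflects how many counters the kernel actually provides.
 */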
int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
			struct btrfs_ioctl_get_dev_stats *stats)
{
	struct btrfs_device *dev;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	int i;

	mutex_lock(&fs_devices->device_list_mutex);
	dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL);
	mutex_unlock(&fs_devices->device_list_mutex);

	if (!dev) {
		btrfs_warn(fs_info, "get dev_stats failed, device not found");
		return -ENODEV;
	} else if (!dev->dev_stats_valid) {
		btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
		return -ENODEV;
	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
			if (stats->nr_items > i)
				stats->values[i] =
					btrfs_dev_stat_read_and_reset(dev, i);
			else
				btrfs_dev_stat_set(dev, i, 0);
		}
		btrfs_info(fs_info, "device stats zeroed by %s (%d)",
			   current->comm, task_pid_nr(current));
	} else {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
			if (stats->nr_items > i)
				stats->values[i] = btrfs_dev_stat_read(dev, i);
	}
	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
		stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
	return 0;
}

/*
 * Update the size and bytes used for each device where it changed. This is
 * delayed since we would otherwise get errors while writing out the
 * superblocks.
 *
 * Must be invoked during transaction commit.
 */
void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
{
	struct btrfs_device *curr, *next;

	ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);

	if (list_empty(&trans->dev_update_list))
		return;

	/*
	 * We don't need the device_list_mutex here. This list is owned by the
	 * transaction and the transaction must complete before the device is
	 * released.
	 */
	mutex_lock(&trans->fs_info->chunk_mutex);
	list_for_each_entry_safe(curr, next, &trans->dev_update_list,
				 post_commit_list) {
		list_del_init(&curr->post_commit_list);
		curr->commit_total_bytes = curr->disk_total_bytes;
		curr->commit_bytes_used = curr->bytes_used;
	}
	mutex_unlock(&trans->fs_info->chunk_mutex);
}

/*
 * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
 */
int btrfs_bg_type_to_factor(u64 flags)
{
	const int index = btrfs_bg_flags_to_raid_index(flags);

	return btrfs_raid_array[index].ncopies;
}
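
/*
 * Cross-check a single dev extent item against the chunk mapping in memory.
 *
 * The checks below are, in order: the chunk referenced by the dev extent
 * exists, the extent length matches the per-device stripe length calculated
 * for that chunk, the (devid, physical offset) pair matches one of the
 * chunk's stripes (counting it in map->verified_stripes), the extent stays
 * within the device, and on zoned devices it is aligned to the zone size.
 * Any mismatch is reported and treated as filesystem corruption (-EUCLEAN).
 */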
static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
				 u64 chunk_offset, u64 devid,
				 u64 physical_offset, u64 physical_len)
{
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_device *dev;
	u64 stripe_len;
	bool found = false;
	int ret = 0;
	int i;

	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
	read_unlock(&em_tree->lock);

	if (!em) {
		btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
			  physical_offset, devid);
		ret = -EUCLEAN;
		goto out;
	}

	map = em->map_lookup;
	stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
	if (physical_len != stripe_len) {
		btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
			  physical_offset, devid, em->start, physical_len,
			  stripe_len);
		ret = -EUCLEAN;
		goto out;
	}

	for (i = 0; i < map->num_stripes; i++) {
		if (map->stripes[i].dev->devid == devid &&
		    map->stripes[i].physical == physical_offset) {
			found = true;
			if (map->verified_stripes >= map->num_stripes) {
				btrfs_err(fs_info,
					"too many dev extents for chunk %llu found",
					  em->start);
				ret = -EUCLEAN;
				goto out;
			}
			map->verified_stripes++;
			break;
		}
	}
	if (!found) {
		btrfs_err(fs_info,
	"dev extent physical offset %llu devid %llu has no corresponding chunk",
			physical_offset, devid);
		ret = -EUCLEAN;
	}

	/* Make sure no dev extent is beyond device boundary */
	dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
	if (!dev) {
		btrfs_err(fs_info, "failed to find devid %llu", devid);
		ret = -EUCLEAN;
		goto out;
	}

	if (physical_offset + physical_len > dev->disk_total_bytes) {
		btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
			  devid, physical_offset, physical_len,
			  dev->disk_total_bytes);
		ret = -EUCLEAN;
		goto out;
	}

	if (dev->zone_info) {
		u64 zone_size = dev->zone_info->zone_size;

		if (!IS_ALIGNED(physical_offset, zone_size) ||
		    !IS_ALIGNED(physical_len, zone_size)) {
			btrfs_err(fs_info,
"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
				  devid, physical_offset, physical_len);
			ret = -EUCLEAN;
			goto out;
		}
	}

out:
	free_extent_map(em);
	return ret;
}

static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
{
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct rb_node *node;
	int ret = 0;

	read_lock(&em_tree->lock);
	for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
		em = rb_entry(node, struct extent_map, rb_node);
		if (em->map_lookup->num_stripes !=
		    em->map_lookup->verified_stripes) {
			btrfs_err(fs_info,
			"chunk %llu has missing dev extent, have %d expect %d",
				  em->start, em->map_lookup->verified_stripes,
				  em->map_lookup->num_stripes);
			ret = -EUCLEAN;
			goto out;
		}
	}
out:
	read_unlock(&em_tree->lock);
	return ret;
}
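
/*
 * Dev extent items are keyed by (devid, BTRFS_DEV_EXTENT_KEY, physical
 * offset), so the tree walk in btrfs_verify_dev_extents() below visits them
 * ordered by physical offset per device; comparing each extent only against
 * the end of the previous one is therefore enough to detect overlaps.
 */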

/*
 * Ensure that all dev extents are mapped to correct chunk, otherwise
 * later chunk allocation/free would cause unexpected behavior.
 *
 * NOTE: This will iterate through the whole device tree, which should be of
 * the same size level as the chunk tree. This slightly increases mount time.
 */
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
{
	struct btrfs_path *path;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	u64 prev_devid = 0;
	u64 prev_dev_ext_end = 0;
	int ret = 0;

	/*
	 * We don't have a dev_root because we mounted with ignorebadroots and
	 * failed to load the root, so we want to skip the verification in this
	 * case for sure.
	 *
	 * However if the dev root is fine, but the tree itself is corrupted
	 * we'd still fail to mount. This verification is only to make sure
	 * writes can happen safely, so instead just bypass this check
	 * completely in the case of IGNOREBADROOTS.
	 */
	if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
		return 0;

	key.objectid = 1;
	key.type = BTRFS_DEV_EXTENT_KEY;
	key.offset = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_FORWARD;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
		ret = btrfs_next_leaf(root, path);
		if (ret < 0)
			goto out;
		/* No dev extents at all? Not good */
		if (ret > 0) {
			ret = -EUCLEAN;
			goto out;
		}
	}
	while (1) {
		struct extent_buffer *leaf = path->nodes[0];
		struct btrfs_dev_extent *dext;
		int slot = path->slots[0];
		u64 chunk_offset;
		u64 physical_offset;
		u64 physical_len;
		u64 devid;

		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (key.type != BTRFS_DEV_EXTENT_KEY)
			break;
		devid = key.objectid;
		physical_offset = key.offset;

		dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
		chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
		physical_len = btrfs_dev_extent_length(leaf, dext);

		/* Check if this dev extent overlaps with the previous one */
		if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
			btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
				  devid, physical_offset, prev_dev_ext_end);
			ret = -EUCLEAN;
			goto out;
		}

		ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
					    physical_offset, physical_len);
		if (ret < 0)
			goto out;
		prev_devid = devid;
		prev_dev_ext_end = physical_offset + physical_len;

		ret = btrfs_next_item(root, path);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			ret = 0;
			break;
		}
	}

	/* Ensure all chunks have corresponding dev extents */
	ret = verify_chunk_dev_extent_mapping(fs_info);
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Check whether the given block group or device is pinned by any inode being
 * used as a swapfile.
 */
bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
{
	struct btrfs_swapfile_pin *sp;
	struct rb_node *node;

	spin_lock(&fs_info->swapfile_pins_lock);
	node = fs_info->swapfile_pins.rb_node;
	while (node) {
		sp = rb_entry(node, struct btrfs_swapfile_pin, node);
		if (ptr < sp->ptr)
			node = node->rb_left;
		else if (ptr > sp->ptr)
			node = node->rb_right;
		else
			break;
	}
	spin_unlock(&fs_info->swapfile_pins_lock);
	return node != NULL;
}

static int relocating_repair_kthread(void *data)
{
	struct btrfs_block_group *cache = (struct btrfs_block_group *)data;
	struct btrfs_fs_info *fs_info = cache->fs_info;
	u64 target;
	int ret = 0;

	target = cache->start;
	btrfs_put_block_group(cache);

	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
		btrfs_info(fs_info,
			   "zoned: skip relocating block group %llu to repair: EBUSY",
			   target);
		return -EBUSY;
	}

	mutex_lock(&fs_info->reclaim_bgs_lock);

	/* Ensure block group still exists */
	cache = btrfs_lookup_block_group(fs_info, target);
	if (!cache)
		goto out;

	if (!cache->relocating_repair)
		goto out;

	ret = btrfs_may_alloc_data_chunk(fs_info, target);
	if (ret < 0)
		goto out;

	btrfs_info(fs_info,
		   "zoned: relocating block group %llu to repair IO failure",
		   target);
	ret = btrfs_relocate_chunk(fs_info, target);

out:
	if (cache)
		btrfs_put_block_group(cache);
	mutex_unlock(&fs_info->reclaim_bgs_lock);
	btrfs_exclop_finish(fs_info);

	return ret;
}
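
/*
 * Kick off repair of a zoned block group that saw an I/O failure.
 *
 * On zoned filesystems a damaged range cannot simply be rewritten in place,
 * so the block group containing @logical is marked relocating_repair and
 * relocating_repair_kthread() above relocates it instead. Repair is skipped
 * when the filesystem is mounted degraded or when a repair for this block
 * group is already in flight.
 */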
int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
{
	struct btrfs_block_group *cache;

	/* Do not attempt to repair in degraded state */
	if (btrfs_test_opt(fs_info, DEGRADED))
		return 0;

	cache = btrfs_lookup_block_group(fs_info, logical);
	if (!cache)
		return 0;

	spin_lock(&cache->lock);
	if (cache->relocating_repair) {
		spin_unlock(&cache->lock);
		btrfs_put_block_group(cache);
		return 0;
	}
	cache->relocating_repair = 1;
	spin_unlock(&cache->lock);

	kthread_run(relocating_repair_kthread, cache,
		    "btrfs-relocating-repair");

	return 0;
}