// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2017 Western Digital Corporation or its affiliates.
 *
 * This file is released under the GPL.
 */

#include "dm-zoned.h"

#include <linux/module.h>

#define	DM_MSG_PREFIX		"zoned"

#define DMZ_MIN_BIOS		8192

/*
 * Zone BIO context.
 */
struct dmz_bioctx {
	struct dmz_dev		*dev;
	struct dm_zone		*zone;
	struct bio		*bio;
	refcount_t		ref;
};

/*
 * Chunk work descriptor.
 */
struct dm_chunk_work {
	struct work_struct	work;
	refcount_t		refcount;
	struct dmz_target	*target;
	unsigned int		chunk;
	struct bio_list		bio_list;
};

/*
 * Target descriptor.
 */
struct dmz_target {
	struct dm_dev		**ddev;
	unsigned int		nr_ddevs;

	unsigned int		flags;

	/* Zoned block device information */
	struct dmz_dev		*dev;

	/* For metadata handling */
	struct dmz_metadata	*metadata;

	/* For chunk work */
	struct radix_tree_root	chunk_rxtree;
	struct workqueue_struct *chunk_wq;
	struct mutex		chunk_lock;

	/* For cloned BIOs to zones */
	struct bio_set		bio_set;

	/* For flush */
	spinlock_t		flush_lock;
	struct bio_list		flush_list;
	struct delayed_work	flush_work;
	struct workqueue_struct *flush_wq;
};

/*
 * Flush intervals (seconds).
 */
#define DMZ_FLUSH_PERIOD	(10 * HZ)

/*
 * Target BIO completion.
 */
static inline void dmz_bio_endio(struct bio *bio, blk_status_t status)
{
	struct dmz_bioctx *bioctx =
		dm_per_bio_data(bio, sizeof(struct dmz_bioctx));

	if (status != BLK_STS_OK && bio->bi_status == BLK_STS_OK)
		bio->bi_status = status;
	if (bioctx->dev && bio->bi_status != BLK_STS_OK)
		bioctx->dev->flags |= DMZ_CHECK_BDEV;

	if (refcount_dec_and_test(&bioctx->ref)) {
		struct dm_zone *zone = bioctx->zone;

		if (zone) {
			if (bio->bi_status != BLK_STS_OK &&
			    bio_op(bio) == REQ_OP_WRITE &&
			    dmz_is_seq(zone))
				set_bit(DMZ_SEQ_WRITE_ERR, &zone->flags);
			dmz_deactivate_zone(zone);
		}
		bio_endio(bio);
	}
}

/*
 * Completion callback for an internally cloned target BIO. This terminates the
 * target BIO when there are no more references to its context.
 */
static void dmz_clone_endio(struct bio *clone)
{
	struct dmz_bioctx *bioctx = clone->bi_private;
	blk_status_t status = clone->bi_status;

	bio_put(clone);
	dmz_bio_endio(bioctx->bio, status);
}

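/*
 * Note on BIO context reference counting: the sketch below summarizes how
 * the functions in this file use bioctx->ref; the functions themselves are
 * authoritative. dmz_map() initializes the context with a single reference,
 * dmz_submit_bio() takes one extra reference per clone issued, and
 * dmz_bio_endio() drops one reference per completion, ending the original
 * target BIO only when the count reaches zero. For a BIO split into two
 * clones, the lifecycle is roughly:
 *
 *	refcount_set(&bioctx->ref, 1);     dmz_map()
 *	refcount_inc(&bioctx->ref);        first clone submitted
 *	refcount_inc(&bioctx->ref);        second clone submitted
 *	dmz_bio_endio(bio, status);        first clone completes:  3 -> 2
 *	dmz_bio_endio(bio, status);        second clone completes: 2 -> 1
 *	dmz_bio_endio(bio, status);        dmz_handle_bio() final put: 1 -> 0,
 *	                                   bio_endio(bio) is called
 */
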
/*
 * Issue a clone of a target BIO. The clone may only partially process the
 * original target BIO.
 */
static int dmz_submit_bio(struct dmz_target *dmz, struct dm_zone *zone,
			  struct bio *bio, sector_t chunk_block,
			  unsigned int nr_blocks)
{
	struct dmz_bioctx *bioctx =
		dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
	struct dmz_dev *dev = zone->dev;
	struct bio *clone;

	if (dev->flags & DMZ_BDEV_DYING)
		return -EIO;

	clone = bio_alloc_clone(dev->bdev, bio, GFP_NOIO, &dmz->bio_set);
	if (!clone)
		return -ENOMEM;

	bioctx->dev = dev;
	clone->bi_iter.bi_sector =
		dmz_start_sect(dmz->metadata, zone) + dmz_blk2sect(chunk_block);
	clone->bi_iter.bi_size = dmz_blk2sect(nr_blocks) << SECTOR_SHIFT;
	clone->bi_end_io = dmz_clone_endio;
	clone->bi_private = bioctx;

	bio_advance(bio, clone->bi_iter.bi_size);

	refcount_inc(&bioctx->ref);
	submit_bio_noacct(clone);

	if (bio_op(bio) == REQ_OP_WRITE && dmz_is_seq(zone))
		zone->wp_block += nr_blocks;

	return 0;
}

/*
 * Zero out pages of discarded blocks accessed by a read BIO.
 */
static void dmz_handle_read_zero(struct dmz_target *dmz, struct bio *bio,
				 sector_t chunk_block, unsigned int nr_blocks)
{
	unsigned int size = nr_blocks << DMZ_BLOCK_SHIFT;

	/* Clear nr_blocks */
	swap(bio->bi_iter.bi_size, size);
	zero_fill_bio(bio);
	swap(bio->bi_iter.bi_size, size);

	bio_advance(bio, size);
}

/*
 * Process a read BIO.
 */
static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone,
			   struct bio *bio)
{
	struct dmz_metadata *zmd = dmz->metadata;
	sector_t chunk_block = dmz_chunk_block(zmd, dmz_bio_block(bio));
	unsigned int nr_blocks = dmz_bio_blocks(bio);
	sector_t end_block = chunk_block + nr_blocks;
	struct dm_zone *rzone, *bzone;
	int ret;

	/* Reads into unmapped chunks need only zero the BIO buffer */
	if (!zone) {
		zero_fill_bio(bio);
		return 0;
	}

	DMDEBUG("(%s): READ chunk %llu -> %s zone %u, block %llu, %u blocks",
		dmz_metadata_label(zmd),
		(unsigned long long)dmz_bio_chunk(zmd, bio),
		(dmz_is_rnd(zone) ? "RND" :
		 (dmz_is_cache(zone) ? "CACHE" : "SEQ")),
		zone->id,
		(unsigned long long)chunk_block, nr_blocks);

	/* Check block validity to determine the read location */
	bzone = zone->bzone;
	while (chunk_block < end_block) {
		nr_blocks = 0;
		if (dmz_is_rnd(zone) || dmz_is_cache(zone) ||
		    chunk_block < zone->wp_block) {
			/* Test block validity in the data zone */
			ret = dmz_block_valid(zmd, zone, chunk_block);
			if (ret < 0)
				return ret;
			if (ret > 0) {
				/* Read data zone blocks */
				nr_blocks = ret;
				rzone = zone;
			}
		}

		/*
		 * No valid blocks found in the data zone.
		 * Check the buffer zone, if there is one.
		 */
		if (!nr_blocks && bzone) {
			ret = dmz_block_valid(zmd, bzone, chunk_block);
			if (ret < 0)
				return ret;
			if (ret > 0) {
				/* Read buffer zone blocks */
				nr_blocks = ret;
				rzone = bzone;
			}
		}

		if (nr_blocks) {
			/* Valid blocks found: read them */
			nr_blocks = min_t(unsigned int, nr_blocks,
					  end_block - chunk_block);
			ret = dmz_submit_bio(dmz, rzone, bio,
					     chunk_block, nr_blocks);
			if (ret)
				return ret;
			chunk_block += nr_blocks;
		} else {
			/* No valid block: zero out the current BIO block */
			dmz_handle_read_zero(dmz, bio, chunk_block, 1);
			chunk_block++;
		}
	}

	return 0;
}

/*
 * Write blocks directly in a data zone, at the write pointer.
 * If a buffer zone is assigned, invalidate the blocks written
 * in place.
 */
static int dmz_handle_direct_write(struct dmz_target *dmz,
				   struct dm_zone *zone, struct bio *bio,
				   sector_t chunk_block,
				   unsigned int nr_blocks)
{
	struct dmz_metadata *zmd = dmz->metadata;
	struct dm_zone *bzone = zone->bzone;
	int ret;

	if (dmz_is_readonly(zone))
		return -EROFS;

	/* Submit write */
	ret = dmz_submit_bio(dmz, zone, bio, chunk_block, nr_blocks);
	if (ret)
		return ret;

	/*
	 * Validate the blocks in the data zone and invalidate
	 * in the buffer zone, if there is one.
	 */
	ret = dmz_validate_blocks(zmd, zone, chunk_block, nr_blocks);
	if (ret == 0 && bzone)
		ret = dmz_invalidate_blocks(zmd, bzone, chunk_block, nr_blocks);

	return ret;
}

/*
 * Write blocks in the buffer zone of @zone.
 * If no buffer zone is assigned yet, get one.
 * Called with @zone write locked.
 */
static int dmz_handle_buffered_write(struct dmz_target *dmz,
				     struct dm_zone *zone, struct bio *bio,
				     sector_t chunk_block,
				     unsigned int nr_blocks)
{
	struct dmz_metadata *zmd = dmz->metadata;
	struct dm_zone *bzone;
	int ret;

	/* Get the buffer zone. One will be allocated if needed */
	bzone = dmz_get_chunk_buffer(zmd, zone);
	if (IS_ERR(bzone))
		return PTR_ERR(bzone);

	if (dmz_is_readonly(bzone))
		return -EROFS;

	/* Submit write */
	ret = dmz_submit_bio(dmz, bzone, bio, chunk_block, nr_blocks);
	if (ret)
		return ret;

	/*
	 * Validate the blocks in the buffer zone
	 * and invalidate in the data zone.
	 */
	ret = dmz_validate_blocks(zmd, bzone, chunk_block, nr_blocks);
	if (ret == 0 && chunk_block < zone->wp_block)
		ret = dmz_invalidate_blocks(zmd, zone, chunk_block, nr_blocks);

	return ret;
}

/*
 * Process a write BIO.
 */
static int dmz_handle_write(struct dmz_target *dmz, struct dm_zone *zone,
			    struct bio *bio)
{
	struct dmz_metadata *zmd = dmz->metadata;
	sector_t chunk_block = dmz_chunk_block(zmd, dmz_bio_block(bio));
	unsigned int nr_blocks = dmz_bio_blocks(bio);

	if (!zone)
		return -ENOSPC;

	DMDEBUG("(%s): WRITE chunk %llu -> %s zone %u, block %llu, %u blocks",
		dmz_metadata_label(zmd),
		(unsigned long long)dmz_bio_chunk(zmd, bio),
		(dmz_is_rnd(zone) ? "RND" :
		 (dmz_is_cache(zone) ? "CACHE" : "SEQ")),
		zone->id,
		(unsigned long long)chunk_block, nr_blocks);

	if (dmz_is_rnd(zone) || dmz_is_cache(zone) ||
	    chunk_block == zone->wp_block) {
		/*
		 * zone is a random zone or it is a sequential zone
		 * and the BIO is aligned to the zone write pointer:
		 * direct write the zone.
		 */
		return dmz_handle_direct_write(dmz, zone, bio,
					       chunk_block, nr_blocks);
	}

	/*
	 * This is an unaligned write in a sequential zone:
	 * use buffered write.
	 */
	return dmz_handle_buffered_write(dmz, zone, bio, chunk_block, nr_blocks);
}

/*
 * Process a discard BIO.
 */
static int dmz_handle_discard(struct dmz_target *dmz, struct dm_zone *zone,
			      struct bio *bio)
{
	struct dmz_metadata *zmd = dmz->metadata;
	sector_t block = dmz_bio_block(bio);
	unsigned int nr_blocks = dmz_bio_blocks(bio);
	sector_t chunk_block = dmz_chunk_block(zmd, block);
	int ret = 0;

	/* For unmapped chunks, there is nothing to do */
	if (!zone)
		return 0;

	if (dmz_is_readonly(zone))
		return -EROFS;

	DMDEBUG("(%s): DISCARD chunk %llu -> zone %u, block %llu, %u blocks",
		dmz_metadata_label(dmz->metadata),
		(unsigned long long)dmz_bio_chunk(zmd, bio),
		zone->id,
		(unsigned long long)chunk_block, nr_blocks);

	/*
	 * Invalidate blocks in the data zone and its
	 * buffer zone if one is mapped.
	 */
	if (dmz_is_rnd(zone) || dmz_is_cache(zone) ||
	    chunk_block < zone->wp_block)
		ret = dmz_invalidate_blocks(zmd, zone, chunk_block, nr_blocks);
	if (ret == 0 && zone->bzone)
		ret = dmz_invalidate_blocks(zmd, zone->bzone,
					    chunk_block, nr_blocks);
	return ret;
}

/*
 * Process a BIO.
 */
static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw,
			   struct bio *bio)
{
	struct dmz_bioctx *bioctx =
		dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
	struct dmz_metadata *zmd = dmz->metadata;
	struct dm_zone *zone;
	int ret;

	dmz_lock_metadata(zmd);

	/*
	 * Get the data zone mapping the chunk. There may be no
	 * mapping for read and discard. If a mapping is obtained,
	 * the zone returned will be set to active state.
	 */
	zone = dmz_get_chunk_mapping(zmd, dmz_bio_chunk(zmd, bio),
				     bio_op(bio));
	if (IS_ERR(zone)) {
		ret = PTR_ERR(zone);
		goto out;
	}

	/* Process the BIO */
	if (zone) {
		dmz_activate_zone(zone);
		bioctx->zone = zone;
		dmz_reclaim_bio_acc(zone->dev->reclaim);
	}

	switch (bio_op(bio)) {
	case REQ_OP_READ:
		ret = dmz_handle_read(dmz, zone, bio);
		break;
	case REQ_OP_WRITE:
		ret = dmz_handle_write(dmz, zone, bio);
		break;
	case REQ_OP_DISCARD:
	case REQ_OP_WRITE_ZEROES:
		ret = dmz_handle_discard(dmz, zone, bio);
		break;
	default:
		DMERR("(%s): Unsupported BIO operation 0x%x",
		      dmz_metadata_label(dmz->metadata), bio_op(bio));
		ret = -EIO;
	}

	/*
	 * Release the chunk mapping. This will check that the mapping
	 * is still valid, that is, that the zone used still has valid blocks.
	 */
	if (zone)
		dmz_put_chunk_mapping(zmd, zone);
out:
	dmz_bio_endio(bio, errno_to_blk_status(ret));

	dmz_unlock_metadata(zmd);
}

/*
 * Increment a chunk work reference counter.
 */
static inline void dmz_get_chunk_work(struct dm_chunk_work *cw)
{
	refcount_inc(&cw->refcount);
}

/*
 * Decrement a chunk work reference count and
 * free it if it becomes 0.
 */
static void dmz_put_chunk_work(struct dm_chunk_work *cw)
{
	if (refcount_dec_and_test(&cw->refcount)) {
		WARN_ON(!bio_list_empty(&cw->bio_list));
		radix_tree_delete(&cw->target->chunk_rxtree, cw->chunk);
		kfree(cw);
	}
}

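/*
 * Note on the chunk work life cycle: the description below summarizes the
 * behavior of dmz_chunk_work(), dmz_get/put_chunk_work() and
 * dmz_queue_chunk_work(); the code itself is authoritative. All BIOs
 * targeting the same chunk are funneled through a single struct
 * dm_chunk_work, looked up in chunk_rxtree by chunk number under
 * chunk_lock. The work reference count holds one reference per BIO added
 * to bio_list (the initial count of 1 set at creation covers the first
 * BIO) plus one reference taken when the work is successfully queued.
 * dmz_chunk_work() drops one reference per BIO it processes and a final
 * one for the queueing itself, so the work is deleted from the radix tree
 * and freed once its BIO list has been drained and it is no longer queued.
 */
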
/*
 * Chunk BIO work function.
 */
static void dmz_chunk_work(struct work_struct *work)
{
	struct dm_chunk_work *cw = container_of(work, struct dm_chunk_work, work);
	struct dmz_target *dmz = cw->target;
	struct bio *bio;

	mutex_lock(&dmz->chunk_lock);

	/* Process the chunk BIOs */
	while ((bio = bio_list_pop(&cw->bio_list))) {
		mutex_unlock(&dmz->chunk_lock);
		dmz_handle_bio(dmz, cw, bio);
		mutex_lock(&dmz->chunk_lock);
		dmz_put_chunk_work(cw);
	}

	/* Queueing the work incremented the work refcount */
	dmz_put_chunk_work(cw);

	mutex_unlock(&dmz->chunk_lock);
}

/*
 * Flush work.
 */
static void dmz_flush_work(struct work_struct *work)
{
	struct dmz_target *dmz = container_of(work, struct dmz_target, flush_work.work);
	struct bio *bio;
	int ret;

	/* Flush dirty metadata blocks */
	ret = dmz_flush_metadata(dmz->metadata);
	if (ret)
		DMDEBUG("(%s): Metadata flush failed, rc=%d",
			dmz_metadata_label(dmz->metadata), ret);

	/* Process queued flush requests */
	while (1) {
		spin_lock(&dmz->flush_lock);
		bio = bio_list_pop(&dmz->flush_list);
		spin_unlock(&dmz->flush_lock);

		if (!bio)
			break;

		dmz_bio_endio(bio, errno_to_blk_status(ret));
	}

	queue_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);
}

/*
 * Get a chunk work and start it to process a new BIO.
 * If the BIO chunk has no work yet, create one.
 */
static int dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
{
	unsigned int chunk = dmz_bio_chunk(dmz->metadata, bio);
	struct dm_chunk_work *cw;
	int ret = 0;

	mutex_lock(&dmz->chunk_lock);

	/* Get the BIO chunk work. If one is not active yet, create one */
	cw = radix_tree_lookup(&dmz->chunk_rxtree, chunk);
	if (cw) {
		dmz_get_chunk_work(cw);
	} else {
		/* Create a new chunk work */
		cw = kmalloc(sizeof(struct dm_chunk_work), GFP_NOIO);
		if (unlikely(!cw)) {
			ret = -ENOMEM;
			goto out;
		}

		INIT_WORK(&cw->work, dmz_chunk_work);
		refcount_set(&cw->refcount, 1);
		cw->target = dmz;
		cw->chunk = chunk;
		bio_list_init(&cw->bio_list);

		ret = radix_tree_insert(&dmz->chunk_rxtree, chunk, cw);
		if (unlikely(ret)) {
			kfree(cw);
			goto out;
		}
	}

	bio_list_add(&cw->bio_list, bio);

	if (queue_work(dmz->chunk_wq, &cw->work))
		dmz_get_chunk_work(cw);
out:
	mutex_unlock(&dmz->chunk_lock);
	return ret;
}

/*
 * Check if the backing device is being removed. If it's on the way out,
 * start failing I/O. Reclaim and metadata components also call this
 * function to cleanly abort operation in the event of such failure.
 */
bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev)
{
	if (dmz_dev->flags & DMZ_BDEV_DYING)
		return true;

	if (dmz_dev->flags & DMZ_CHECK_BDEV)
		return !dmz_check_bdev(dmz_dev);

	if (blk_queue_dying(bdev_get_queue(dmz_dev->bdev))) {
		dmz_dev_warn(dmz_dev, "Backing device queue dying");
		dmz_dev->flags |= DMZ_BDEV_DYING;
	}

	return dmz_dev->flags & DMZ_BDEV_DYING;
}

/*
 * Check the backing device availability. This detects such events as
 * backing device going offline due to errors, media removals, etc.
 * This check is less efficient than dmz_bdev_is_dying() and should
 * only be performed as a part of error handling.
 */
bool dmz_check_bdev(struct dmz_dev *dmz_dev)
{
	struct gendisk *disk;

	dmz_dev->flags &= ~DMZ_CHECK_BDEV;

	if (dmz_bdev_is_dying(dmz_dev))
		return false;

	disk = dmz_dev->bdev->bd_disk;
	if (disk->fops->check_events &&
	    disk->fops->check_events(disk, 0) & DISK_EVENT_MEDIA_CHANGE) {
		dmz_dev_warn(dmz_dev, "Backing device offline");
		dmz_dev->flags |= DMZ_BDEV_DYING;
	}

	return !(dmz_dev->flags & DMZ_BDEV_DYING);
}

/*
 * Process a new BIO.
 */
static int dmz_map(struct dm_target *ti, struct bio *bio)
{
	struct dmz_target *dmz = ti->private;
	struct dmz_metadata *zmd = dmz->metadata;
	struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
	sector_t sector = bio->bi_iter.bi_sector;
	unsigned int nr_sectors = bio_sectors(bio);
	sector_t chunk_sector;
	int ret;

	if (dmz_dev_is_dying(zmd))
		return DM_MAPIO_KILL;

	DMDEBUG("(%s): BIO op %d sector %llu + %u => chunk %llu, block %llu, %u blocks",
		dmz_metadata_label(zmd),
		bio_op(bio), (unsigned long long)sector, nr_sectors,
		(unsigned long long)dmz_bio_chunk(zmd, bio),
		(unsigned long long)dmz_chunk_block(zmd, dmz_bio_block(bio)),
		(unsigned int)dmz_bio_blocks(bio));

	if (!nr_sectors && bio_op(bio) != REQ_OP_WRITE)
		return DM_MAPIO_REMAPPED;

	/* The BIO should be block aligned */
	if ((nr_sectors & DMZ_BLOCK_SECTORS_MASK) || (sector & DMZ_BLOCK_SECTORS_MASK))
		return DM_MAPIO_KILL;

	/* Initialize the BIO context */
	bioctx->dev = NULL;
	bioctx->zone = NULL;
	bioctx->bio = bio;
	refcount_set(&bioctx->ref, 1);

	/* Set the BIO pending in the flush list */
	if (!nr_sectors && bio_op(bio) == REQ_OP_WRITE) {
		spin_lock(&dmz->flush_lock);
		bio_list_add(&dmz->flush_list, bio);
		spin_unlock(&dmz->flush_lock);
		mod_delayed_work(dmz->flush_wq, &dmz->flush_work, 0);
		return DM_MAPIO_SUBMITTED;
	}

	/* Split zone BIOs to fit entirely into a zone */
	chunk_sector = sector & (dmz_zone_nr_sectors(zmd) - 1);
	if (chunk_sector + nr_sectors > dmz_zone_nr_sectors(zmd))
		dm_accept_partial_bio(bio, dmz_zone_nr_sectors(zmd) - chunk_sector);

	/* Now ready to handle this BIO */
	ret = dmz_queue_chunk_work(dmz, bio);
	if (ret) {
		DMDEBUG("(%s): BIO op %d, can't process chunk %llu, err %i",
			dmz_metadata_label(zmd),
			bio_op(bio), (u64)dmz_bio_chunk(zmd, bio),
			ret);
		return DM_MAPIO_REQUEUE;
	}

	return DM_MAPIO_SUBMITTED;
}

/*
 * Get zoned device information.
 */
static int dmz_get_zoned_device(struct dm_target *ti, char *path,
				int idx, int nr_devs)
{
	struct dmz_target *dmz = ti->private;
	struct dm_dev *ddev;
	struct dmz_dev *dev;
	int ret;
	struct block_device *bdev;

	/* Get the target device */
	ret = dm_get_device(ti, path, dm_table_get_mode(ti->table), &ddev);
	if (ret) {
		ti->error = "Get target device failed";
		return ret;
	}

	bdev = ddev->bdev;
	if (bdev_zoned_model(bdev) == BLK_ZONED_NONE) {
		if (nr_devs == 1) {
			ti->error = "Invalid regular device";
			goto err;
		}
		if (idx != 0) {
			ti->error = "First device must be a regular device";
			goto err;
		}
		if (dmz->ddev[0]) {
			ti->error = "Too many regular devices";
			goto err;
		}
		dev = &dmz->dev[idx];
		dev->flags = DMZ_BDEV_REGULAR;
	} else {
		if (dmz->ddev[idx]) {
			ti->error = "Too many zoned devices";
			goto err;
		}
		if (nr_devs > 1 && idx == 0) {
			ti->error = "First device must be a regular device";
			goto err;
		}
		dev = &dmz->dev[idx];
	}
	dev->bdev = bdev;
	dev->dev_idx = idx;

	dev->capacity = bdev_nr_sectors(bdev);
	if (ti->begin) {
		ti->error = "Partial mapping is not supported";
		goto err;
	}

	dmz->ddev[idx] = ddev;

	return 0;
err:
	dm_put_device(ti, ddev);
	return -EINVAL;
}

/*
 * Cleanup zoned device information.
 */
static void dmz_put_zoned_device(struct dm_target *ti)
{
	struct dmz_target *dmz = ti->private;
	int i;

	for (i = 0; i < dmz->nr_ddevs; i++) {
		if (dmz->ddev[i]) {
			dm_put_device(ti, dmz->ddev[i]);
			dmz->ddev[i] = NULL;
		}
	}
}

static int dmz_fixup_devices(struct dm_target *ti)
{
	struct dmz_target *dmz = ti->private;
	struct dmz_dev *reg_dev, *zoned_dev;
	struct request_queue *q;
	sector_t zone_nr_sectors = 0;
	int i;

	/*
	 * When we have more than one device, the first one must be a
	 * regular block device and the others zoned block devices.
	 */
	if (dmz->nr_ddevs > 1) {
		reg_dev = &dmz->dev[0];
		if (!(reg_dev->flags & DMZ_BDEV_REGULAR)) {
			ti->error = "Primary disk is not a regular device";
			return -EINVAL;
		}
		for (i = 1; i < dmz->nr_ddevs; i++) {
			zoned_dev = &dmz->dev[i];
			if (zoned_dev->flags & DMZ_BDEV_REGULAR) {
				ti->error = "Secondary disk is not a zoned device";
				return -EINVAL;
			}
			q = bdev_get_queue(zoned_dev->bdev);
			if (zone_nr_sectors &&
			    zone_nr_sectors != blk_queue_zone_sectors(q)) {
				ti->error = "Zone nr sectors mismatch";
				return -EINVAL;
			}
			zone_nr_sectors = blk_queue_zone_sectors(q);
			zoned_dev->zone_nr_sectors = zone_nr_sectors;
			zoned_dev->nr_zones =
				blkdev_nr_zones(zoned_dev->bdev->bd_disk);
		}
	} else {
		reg_dev = NULL;
		zoned_dev = &dmz->dev[0];
		if (zoned_dev->flags & DMZ_BDEV_REGULAR) {
			ti->error = "Disk is not a zoned device";
			return -EINVAL;
		}
		q = bdev_get_queue(zoned_dev->bdev);
		zoned_dev->zone_nr_sectors = blk_queue_zone_sectors(q);
		zoned_dev->nr_zones = blkdev_nr_zones(zoned_dev->bdev->bd_disk);
	}

	if (reg_dev) {
		sector_t zone_offset;

		reg_dev->zone_nr_sectors = zone_nr_sectors;
		reg_dev->nr_zones =
			DIV_ROUND_UP_SECTOR_T(reg_dev->capacity,
					      reg_dev->zone_nr_sectors);
		reg_dev->zone_offset = 0;
		zone_offset = reg_dev->nr_zones;
		for (i = 1; i < dmz->nr_ddevs; i++) {
			dmz->dev[i].zone_offset = zone_offset;
			zone_offset += dmz->dev[i].nr_zones;
		}
	}
	return 0;
}

/*
 * Setup target.
 */
static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct dmz_target *dmz;
	int ret, i;

	/* Check arguments */
	if (argc < 1) {
		ti->error = "Invalid argument count";
		return -EINVAL;
	}

	/* Allocate and initialize the target descriptor */
	dmz = kzalloc(sizeof(struct dmz_target), GFP_KERNEL);
	if (!dmz) {
		ti->error = "Unable to allocate the zoned target descriptor";
		return -ENOMEM;
	}
	dmz->dev = kcalloc(argc, sizeof(struct dmz_dev), GFP_KERNEL);
	if (!dmz->dev) {
		ti->error = "Unable to allocate the zoned device descriptors";
		kfree(dmz);
		return -ENOMEM;
	}
	dmz->ddev = kcalloc(argc, sizeof(struct dm_dev *), GFP_KERNEL);
	if (!dmz->ddev) {
		ti->error = "Unable to allocate the dm device descriptors";
		ret = -ENOMEM;
		goto err;
	}
	dmz->nr_ddevs = argc;

	ti->private = dmz;

	/* Get the target zoned block device */
	for (i = 0; i < argc; i++) {
		ret = dmz_get_zoned_device(ti, argv[i], i, argc);
		if (ret)
			goto err_dev;
	}
	ret = dmz_fixup_devices(ti);
	if (ret)
		goto err_dev;

	/* Initialize metadata */
	ret = dmz_ctr_metadata(dmz->dev, argc, &dmz->metadata,
			       dm_table_device_name(ti->table));
	if (ret) {
		ti->error = "Metadata initialization failed";
		goto err_dev;
	}

	/* Set target (no write same support) */
	ti->max_io_len = dmz_zone_nr_sectors(dmz->metadata);
	ti->num_flush_bios = 1;
	ti->num_discard_bios = 1;
	ti->num_write_zeroes_bios = 1;
	ti->per_io_data_size = sizeof(struct dmz_bioctx);
	ti->flush_supported = true;
	ti->discards_supported = true;

	/* The exposed capacity is the number of chunks that can be mapped */
	ti->len = (sector_t)dmz_nr_chunks(dmz->metadata) <<
		dmz_zone_nr_sectors_shift(dmz->metadata);

	/* Zone BIO */
	ret = bioset_init(&dmz->bio_set, DMZ_MIN_BIOS, 0, 0);
	if (ret) {
		ti->error = "Create BIO set failed";
		goto err_meta;
	}

	/* Chunk BIO work */
	mutex_init(&dmz->chunk_lock);
	INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_NOIO);
	dmz->chunk_wq = alloc_workqueue("dmz_cwq_%s",
					WQ_MEM_RECLAIM | WQ_UNBOUND, 0,
					dmz_metadata_label(dmz->metadata));
	if (!dmz->chunk_wq) {
		ti->error = "Create chunk workqueue failed";
		ret = -ENOMEM;
		goto err_bio;
	}

	/* Flush work */
	spin_lock_init(&dmz->flush_lock);
	bio_list_init(&dmz->flush_list);
	INIT_DELAYED_WORK(&dmz->flush_work, dmz_flush_work);
	dmz->flush_wq = alloc_ordered_workqueue("dmz_fwq_%s", WQ_MEM_RECLAIM,
						dmz_metadata_label(dmz->metadata));
	if (!dmz->flush_wq) {
		ti->error = "Create flush workqueue failed";
		ret = -ENOMEM;
		goto err_cwq;
	}
	mod_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);

	/* Initialize reclaim */
	for (i = 0; i < dmz->nr_ddevs; i++) {
		ret = dmz_ctr_reclaim(dmz->metadata, &dmz->dev[i].reclaim, i);
		if (ret) {
			ti->error = "Zone reclaim initialization failed";
			goto err_fwq;
		}
	}

	DMINFO("(%s): Target device: %llu 512-byte logical sectors (%llu blocks)",
	       dmz_metadata_label(dmz->metadata),
	       (unsigned long long)ti->len,
	       (unsigned long long)dmz_sect2blk(ti->len));

	return 0;
err_fwq:
	destroy_workqueue(dmz->flush_wq);
err_cwq:
	destroy_workqueue(dmz->chunk_wq);
err_bio:
	mutex_destroy(&dmz->chunk_lock);
	bioset_exit(&dmz->bio_set);
err_meta:
	dmz_dtr_metadata(dmz->metadata);
err_dev:
	dmz_put_zoned_device(ti);
err:
	kfree(dmz->dev);
	kfree(dmz);

	return ret;
}

/*
 * Cleanup target.
 */
static void dmz_dtr(struct dm_target *ti)
{
	struct dmz_target *dmz = ti->private;
	int i;

	destroy_workqueue(dmz->chunk_wq);

	for (i = 0; i < dmz->nr_ddevs; i++)
		dmz_dtr_reclaim(dmz->dev[i].reclaim);

	cancel_delayed_work_sync(&dmz->flush_work);
	destroy_workqueue(dmz->flush_wq);

	(void) dmz_flush_metadata(dmz->metadata);

	dmz_dtr_metadata(dmz->metadata);

	bioset_exit(&dmz->bio_set);

	dmz_put_zoned_device(ti);

	mutex_destroy(&dmz->chunk_lock);

	kfree(dmz->dev);
	kfree(dmz);
}

/*
 * Setup target request queue limits.
 */
static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct dmz_target *dmz = ti->private;
	unsigned int chunk_sectors = dmz_zone_nr_sectors(dmz->metadata);

	limits->logical_block_size = DMZ_BLOCK_SIZE;
	limits->physical_block_size = DMZ_BLOCK_SIZE;

	blk_limits_io_min(limits, DMZ_BLOCK_SIZE);
	blk_limits_io_opt(limits, DMZ_BLOCK_SIZE);

	limits->discard_alignment = DMZ_BLOCK_SIZE;
	limits->discard_granularity = DMZ_BLOCK_SIZE;
	limits->max_discard_sectors = chunk_sectors;
	limits->max_hw_discard_sectors = chunk_sectors;
	limits->max_write_zeroes_sectors = chunk_sectors;

	/* FS hint to try to align to the device zone size */
	limits->chunk_sectors = chunk_sectors;
	limits->max_sectors = chunk_sectors;

	/* We are exposing a drive-managed zoned block device */
	limits->zoned = BLK_ZONED_NONE;
}

/*
 * Pass on ioctl to the backend device.
 */
static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
{
	struct dmz_target *dmz = ti->private;
	struct dmz_dev *dev = &dmz->dev[0];

	if (!dmz_check_bdev(dev))
		return -EIO;

	*bdev = dev->bdev;

	return 0;
}

/*
 * Stop works on suspend.
 */
static void dmz_suspend(struct dm_target *ti)
{
	struct dmz_target *dmz = ti->private;
	int i;

	flush_workqueue(dmz->chunk_wq);
	for (i = 0; i < dmz->nr_ddevs; i++)
		dmz_suspend_reclaim(dmz->dev[i].reclaim);
	cancel_delayed_work_sync(&dmz->flush_work);
}

/*
 * Restart works on resume or if suspend failed.
 */
static void dmz_resume(struct dm_target *ti)
{
	struct dmz_target *dmz = ti->private;
	int i;

	queue_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);
	for (i = 0; i < dmz->nr_ddevs; i++)
		dmz_resume_reclaim(dmz->dev[i].reclaim);
}

static int dmz_iterate_devices(struct dm_target *ti,
			       iterate_devices_callout_fn fn, void *data)
{
	struct dmz_target *dmz = ti->private;
	unsigned int zone_nr_sectors = dmz_zone_nr_sectors(dmz->metadata);
	sector_t capacity;
	int i, r;

	for (i = 0; i < dmz->nr_ddevs; i++) {
		capacity = dmz->dev[i].capacity & ~(zone_nr_sectors - 1);
		r = fn(ti, dmz->ddev[i], 0, capacity, data);
		if (r)
			break;
	}
	return r;
}

static void dmz_status(struct dm_target *ti, status_type_t type,
		       unsigned int status_flags, char *result,
		       unsigned int maxlen)
{
	struct dmz_target *dmz = ti->private;
	ssize_t sz = 0;
	char buf[BDEVNAME_SIZE];
	struct dmz_dev *dev;
	int i;

	switch (type) {
	case STATUSTYPE_INFO:
		DMEMIT("%u zones %u/%u cache",
		       dmz_nr_zones(dmz->metadata),
		       dmz_nr_unmap_cache_zones(dmz->metadata),
		       dmz_nr_cache_zones(dmz->metadata));
		for (i = 0; i < dmz->nr_ddevs; i++) {
			/*
			 * For a multi-device setup the first device
			 * contains only cache zones.
			 */
			if ((i == 0) &&
			    (dmz_nr_cache_zones(dmz->metadata) > 0))
				continue;
			DMEMIT(" %u/%u random %u/%u sequential",
			       dmz_nr_unmap_rnd_zones(dmz->metadata, i),
			       dmz_nr_rnd_zones(dmz->metadata, i),
			       dmz_nr_unmap_seq_zones(dmz->metadata, i),
			       dmz_nr_seq_zones(dmz->metadata, i));
		}
		break;
	case STATUSTYPE_TABLE:
		dev = &dmz->dev[0];
		format_dev_t(buf, dev->bdev->bd_dev);
		DMEMIT("%s", buf);
		for (i = 1; i < dmz->nr_ddevs; i++) {
			dev = &dmz->dev[i];
			format_dev_t(buf, dev->bdev->bd_dev);
			DMEMIT(" %s", buf);
		}
		break;
	case STATUSTYPE_IMA:
		*result = '\0';
		break;
	}
	return;
}

static int dmz_message(struct dm_target *ti, unsigned int argc, char **argv,
		       char *result, unsigned int maxlen)
{
	struct dmz_target *dmz = ti->private;
	int r = -EINVAL;

	if (!strcasecmp(argv[0], "reclaim")) {
		int i;

		for (i = 0; i < dmz->nr_ddevs; i++)
			dmz_schedule_reclaim(dmz->dev[i].reclaim);
		r = 0;
	} else
		DMERR("unrecognized message %s", argv[0]);
	return r;
}

static struct target_type dmz_type = {
	.name		 = "zoned",
	.version	 = {2, 0, 0},
	.features	 = DM_TARGET_SINGLETON | DM_TARGET_MIXED_ZONED_MODEL,
	.module		 = THIS_MODULE,
	.ctr		 = dmz_ctr,
	.dtr		 = dmz_dtr,
	.map		 = dmz_map,
	.io_hints	 = dmz_io_hints,
	.prepare_ioctl	 = dmz_prepare_ioctl,
	.postsuspend	 = dmz_suspend,
	.resume		 = dmz_resume,
	.iterate_devices = dmz_iterate_devices,
	.status		 = dmz_status,
	.message	 = dmz_message,
};

static int __init dmz_init(void)
{
	return dm_register_target(&dmz_type);
}

static void __exit dmz_exit(void)
{
	dm_unregister_target(&dmz_type);
}

module_init(dmz_init);
module_exit(dmz_exit);

MODULE_DESCRIPTION(DM_NAME " target for zoned block devices");
MODULE_AUTHOR("Damien Le Moal <damien.lemoal@wdc.com>");
MODULE_LICENSE("GPL");