// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2017 Western Digital Corporation or its affiliates.
 *
 * This file is released under the GPL.
 */

#include "dm-zoned.h"

#include <linux/module.h>

#define DM_MSG_PREFIX		"zoned"

#define DMZ_MIN_BIOS		8192

/*
 * Zone BIO context.
 */
struct dmz_bioctx {
	struct dmz_dev		*dev;
	struct dm_zone		*zone;
	struct bio		*bio;
	refcount_t		ref;
};

/*
 * Chunk work descriptor.
 */
struct dm_chunk_work {
	struct work_struct	work;
	refcount_t		refcount;
	struct dmz_target	*target;
	unsigned int		chunk;
	struct bio_list		bio_list;
};

/*
 * Target descriptor.
 */
struct dmz_target {
	struct dm_dev		**ddev;
	unsigned int		nr_ddevs;

	unsigned int		flags;

	/* Zoned block device information */
	struct dmz_dev		*dev;

	/* For metadata handling */
	struct dmz_metadata	*metadata;

	/* For chunk work */
	struct radix_tree_root	chunk_rxtree;
	struct workqueue_struct *chunk_wq;
	struct mutex		chunk_lock;

	/* For cloned BIOs to zones */
	struct bio_set		bio_set;

	/* For flush */
	spinlock_t		flush_lock;
	struct bio_list		flush_list;
	struct delayed_work	flush_work;
	struct workqueue_struct *flush_wq;
};

/*
 * Flush intervals (seconds).
 */
#define DMZ_FLUSH_PERIOD	(10 * HZ)

/*
 * Target BIO completion.
 */
static inline void dmz_bio_endio(struct bio *bio, blk_status_t status)
{
	struct dmz_bioctx *bioctx =
		dm_per_bio_data(bio, sizeof(struct dmz_bioctx));

	if (status != BLK_STS_OK && bio->bi_status == BLK_STS_OK)
		bio->bi_status = status;
	if (bioctx->dev && bio->bi_status != BLK_STS_OK)
		bioctx->dev->flags |= DMZ_CHECK_BDEV;

	if (refcount_dec_and_test(&bioctx->ref)) {
		struct dm_zone *zone = bioctx->zone;

		if (zone) {
			if (bio->bi_status != BLK_STS_OK &&
			    bio_op(bio) == REQ_OP_WRITE &&
			    dmz_is_seq(zone))
				set_bit(DMZ_SEQ_WRITE_ERR, &zone->flags);
			dmz_deactivate_zone(zone);
		}
		bio_endio(bio);
	}
}

/*
 * Completion callback for an internally cloned target BIO. This terminates the
 * target BIO when there are no more references to its context.
 */
static void dmz_clone_endio(struct bio *clone)
{
	struct dmz_bioctx *bioctx = clone->bi_private;
	blk_status_t status = clone->bi_status;

	bio_put(clone);
	dmz_bio_endio(bioctx->bio, status);
}

/*
 * Issue a clone of a target BIO. The clone may only partially process the
 * original target BIO.
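 * The clone shares the data pages of the target BIO and is submitted to the
 * backing device of the target zone; the BIO context reference count tracks
 * it until dmz_clone_endio() completes.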
 */
static int dmz_submit_bio(struct dmz_target *dmz, struct dm_zone *zone,
			  struct bio *bio, sector_t chunk_block,
			  unsigned int nr_blocks)
{
	struct dmz_bioctx *bioctx =
		dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
	struct dmz_dev *dev = zone->dev;
	struct bio *clone;

	if (dev->flags & DMZ_BDEV_DYING)
		return -EIO;

	clone = bio_clone_fast(bio, GFP_NOIO, &dmz->bio_set);
	if (!clone)
		return -ENOMEM;

	bio_set_dev(clone, dev->bdev);
	bioctx->dev = dev;
	clone->bi_iter.bi_sector =
		dmz_start_sect(dmz->metadata, zone) + dmz_blk2sect(chunk_block);
	clone->bi_iter.bi_size = dmz_blk2sect(nr_blocks) << SECTOR_SHIFT;
	clone->bi_end_io = dmz_clone_endio;
	clone->bi_private = bioctx;

	bio_advance(bio, clone->bi_iter.bi_size);

	refcount_inc(&bioctx->ref);
	submit_bio_noacct(clone);

	if (bio_op(bio) == REQ_OP_WRITE && dmz_is_seq(zone))
		zone->wp_block += nr_blocks;

	return 0;
}

/*
 * Zero out pages of discarded blocks accessed by a read BIO.
 */
static void dmz_handle_read_zero(struct dmz_target *dmz, struct bio *bio,
				 sector_t chunk_block, unsigned int nr_blocks)
{
	unsigned int size = nr_blocks << DMZ_BLOCK_SHIFT;

	/* Clear nr_blocks */
	swap(bio->bi_iter.bi_size, size);
	zero_fill_bio(bio);
	swap(bio->bi_iter.bi_size, size);

	bio_advance(bio, size);
}

/*
 * Process a read BIO.
 */
static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone,
			   struct bio *bio)
{
	struct dmz_metadata *zmd = dmz->metadata;
	sector_t chunk_block = dmz_chunk_block(zmd, dmz_bio_block(bio));
	unsigned int nr_blocks = dmz_bio_blocks(bio);
	sector_t end_block = chunk_block + nr_blocks;
	struct dm_zone *rzone, *bzone;
	int ret;

	/* Reads into unmapped chunks only need to zero the BIO buffer */
	if (!zone) {
		zero_fill_bio(bio);
		return 0;
	}

	DMDEBUG("(%s): READ chunk %llu -> %s zone %u, block %llu, %u blocks",
		dmz_metadata_label(zmd),
		(unsigned long long)dmz_bio_chunk(zmd, bio),
		(dmz_is_rnd(zone) ? "RND" :
		 (dmz_is_cache(zone) ? "CACHE" : "SEQ")),
		zone->id,
		(unsigned long long)chunk_block, nr_blocks);

	/* Check block validity to determine the read location */
	bzone = zone->bzone;
	while (chunk_block < end_block) {
		nr_blocks = 0;
		if (dmz_is_rnd(zone) || dmz_is_cache(zone) ||
		    chunk_block < zone->wp_block) {
			/* Test block validity in the data zone */
			ret = dmz_block_valid(zmd, zone, chunk_block);
			if (ret < 0)
				return ret;
			if (ret > 0) {
				/* Read data zone blocks */
				nr_blocks = ret;
				rzone = zone;
			}
		}

		/*
		 * No valid blocks found in the data zone.
		 * Check the buffer zone, if there is one.
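		 * Unaligned writes to a sequential zone are redirected to its
		 * buffer zone, so a read may have to fetch blocks from there.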
		 */
		if (!nr_blocks && bzone) {
			ret = dmz_block_valid(zmd, bzone, chunk_block);
			if (ret < 0)
				return ret;
			if (ret > 0) {
				/* Read buffer zone blocks */
				nr_blocks = ret;
				rzone = bzone;
			}
		}

		if (nr_blocks) {
			/* Valid blocks found: read them */
			nr_blocks = min_t(unsigned int, nr_blocks,
					  end_block - chunk_block);
			ret = dmz_submit_bio(dmz, rzone, bio,
					     chunk_block, nr_blocks);
			if (ret)
				return ret;
			chunk_block += nr_blocks;
		} else {
			/* No valid block: zero out the current BIO block */
			dmz_handle_read_zero(dmz, bio, chunk_block, 1);
			chunk_block++;
		}
	}

	return 0;
}

/*
 * Write blocks directly in a data zone, at the write pointer.
 * If a buffer zone is assigned, invalidate the blocks written
 * in place.
 */
static int dmz_handle_direct_write(struct dmz_target *dmz,
				   struct dm_zone *zone, struct bio *bio,
				   sector_t chunk_block,
				   unsigned int nr_blocks)
{
	struct dmz_metadata *zmd = dmz->metadata;
	struct dm_zone *bzone = zone->bzone;
	int ret;

	if (dmz_is_readonly(zone))
		return -EROFS;

	/* Submit write */
	ret = dmz_submit_bio(dmz, zone, bio, chunk_block, nr_blocks);
	if (ret)
		return ret;

	/*
	 * Validate the blocks in the data zone and invalidate
	 * in the buffer zone, if there is one.
	 */
	ret = dmz_validate_blocks(zmd, zone, chunk_block, nr_blocks);
	if (ret == 0 && bzone)
		ret = dmz_invalidate_blocks(zmd, bzone, chunk_block, nr_blocks);

	return ret;
}

/*
 * Write blocks in the buffer zone of @zone.
 * If no buffer zone is assigned yet, get one.
 * Called with @zone write locked.
 */
static int dmz_handle_buffered_write(struct dmz_target *dmz,
				     struct dm_zone *zone, struct bio *bio,
				     sector_t chunk_block,
				     unsigned int nr_blocks)
{
	struct dmz_metadata *zmd = dmz->metadata;
	struct dm_zone *bzone;
	int ret;

	/* Get the buffer zone. One will be allocated if needed */
	bzone = dmz_get_chunk_buffer(zmd, zone);
	if (IS_ERR(bzone))
		return PTR_ERR(bzone);

	if (dmz_is_readonly(bzone))
		return -EROFS;

	/* Submit write */
	ret = dmz_submit_bio(dmz, bzone, bio, chunk_block, nr_blocks);
	if (ret)
		return ret;

	/*
	 * Validate the blocks in the buffer zone
	 * and invalidate in the data zone.
	 */
	ret = dmz_validate_blocks(zmd, bzone, chunk_block, nr_blocks);
	if (ret == 0 && chunk_block < zone->wp_block)
		ret = dmz_invalidate_blocks(zmd, zone, chunk_block, nr_blocks);

	return ret;
}

/*
 * Process a write BIO.
 */
static int dmz_handle_write(struct dmz_target *dmz, struct dm_zone *zone,
			    struct bio *bio)
{
	struct dmz_metadata *zmd = dmz->metadata;
	sector_t chunk_block = dmz_chunk_block(zmd, dmz_bio_block(bio));
	unsigned int nr_blocks = dmz_bio_blocks(bio);

	if (!zone)
		return -ENOSPC;

	DMDEBUG("(%s): WRITE chunk %llu -> %s zone %u, block %llu, %u blocks",
		dmz_metadata_label(zmd),
		(unsigned long long)dmz_bio_chunk(zmd, bio),
		(dmz_is_rnd(zone) ? "RND" :
		 (dmz_is_cache(zone) ? "CACHE" : "SEQ")),
		zone->id,
		(unsigned long long)chunk_block, nr_blocks);

	if (dmz_is_rnd(zone) || dmz_is_cache(zone) ||
	    chunk_block == zone->wp_block) {
		/*
		 * zone is a random zone or it is a sequential zone
		 * and the BIO is aligned to the zone write pointer:
		 * direct write the zone.
		 */
		return dmz_handle_direct_write(dmz, zone, bio,
					       chunk_block, nr_blocks);
	}

	/*
	 * This is an unaligned write in a sequential zone:
	 * use buffered write.
	 */
	return dmz_handle_buffered_write(dmz, zone, bio, chunk_block, nr_blocks);
}

/*
 * Process a discard BIO.
 */
static int dmz_handle_discard(struct dmz_target *dmz, struct dm_zone *zone,
			      struct bio *bio)
{
	struct dmz_metadata *zmd = dmz->metadata;
	sector_t block = dmz_bio_block(bio);
	unsigned int nr_blocks = dmz_bio_blocks(bio);
	sector_t chunk_block = dmz_chunk_block(zmd, block);
	int ret = 0;

	/* For unmapped chunks, there is nothing to do */
	if (!zone)
		return 0;

	if (dmz_is_readonly(zone))
		return -EROFS;

	DMDEBUG("(%s): DISCARD chunk %llu -> zone %u, block %llu, %u blocks",
		dmz_metadata_label(dmz->metadata),
		(unsigned long long)dmz_bio_chunk(zmd, bio),
		zone->id,
		(unsigned long long)chunk_block, nr_blocks);

	/*
	 * Invalidate blocks in the data zone and its
	 * buffer zone if one is mapped.
	 */
	if (dmz_is_rnd(zone) || dmz_is_cache(zone) ||
	    chunk_block < zone->wp_block)
		ret = dmz_invalidate_blocks(zmd, zone, chunk_block, nr_blocks);
	if (ret == 0 && zone->bzone)
		ret = dmz_invalidate_blocks(zmd, zone->bzone,
					    chunk_block, nr_blocks);
	return ret;
}

/*
 * Process a BIO.
 */
static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw,
			   struct bio *bio)
{
	struct dmz_bioctx *bioctx =
		dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
	struct dmz_metadata *zmd = dmz->metadata;
	struct dm_zone *zone;
	int ret;

	dmz_lock_metadata(zmd);

	/*
	 * Get the data zone mapping the chunk. There may be no
	 * mapping for read and discard. If a mapping is obtained,
	 * the zone returned will be set to active state.
	 */
	zone = dmz_get_chunk_mapping(zmd, dmz_bio_chunk(zmd, bio),
				     bio_op(bio));
	if (IS_ERR(zone)) {
		ret = PTR_ERR(zone);
		goto out;
	}

	/* Process the BIO */
	if (zone) {
		dmz_activate_zone(zone);
		bioctx->zone = zone;
		dmz_reclaim_bio_acc(zone->dev->reclaim);
	}

	switch (bio_op(bio)) {
	case REQ_OP_READ:
		ret = dmz_handle_read(dmz, zone, bio);
		break;
	case REQ_OP_WRITE:
		ret = dmz_handle_write(dmz, zone, bio);
		break;
	case REQ_OP_DISCARD:
	case REQ_OP_WRITE_ZEROES:
		ret = dmz_handle_discard(dmz, zone, bio);
		break;
	default:
		DMERR("(%s): Unsupported BIO operation 0x%x",
		      dmz_metadata_label(dmz->metadata), bio_op(bio));
		ret = -EIO;
	}

	/*
	 * Release the chunk mapping. This will check that the mapping
	 * is still valid, that is, that the zone used still has valid blocks.
	 */
	if (zone)
		dmz_put_chunk_mapping(zmd, zone);
out:
	dmz_bio_endio(bio, errno_to_blk_status(ret));

	dmz_unlock_metadata(zmd);
}

/*
 * Increment a chunk reference counter.
 */
static inline void dmz_get_chunk_work(struct dm_chunk_work *cw)
{
	refcount_inc(&cw->refcount);
}

/*
 * Decrement a chunk work reference count and
 * free it if it becomes 0.
 */
static void dmz_put_chunk_work(struct dm_chunk_work *cw)
{
	if (refcount_dec_and_test(&cw->refcount)) {
		WARN_ON(!bio_list_empty(&cw->bio_list));
		radix_tree_delete(&cw->target->chunk_rxtree, cw->chunk);
		kfree(cw);
	}
}

/*
 * Chunk BIO work function.
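 *
 * Pops and processes all BIOs queued on the chunk work. The chunk lock is
 * dropped while each BIO is handled so that dmz_queue_chunk_work() can keep
 * adding new BIOs to the list.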
 */
static void dmz_chunk_work(struct work_struct *work)
{
	struct dm_chunk_work *cw = container_of(work, struct dm_chunk_work, work);
	struct dmz_target *dmz = cw->target;
	struct bio *bio;

	mutex_lock(&dmz->chunk_lock);

	/* Process the chunk BIOs */
	while ((bio = bio_list_pop(&cw->bio_list))) {
		mutex_unlock(&dmz->chunk_lock);
		dmz_handle_bio(dmz, cw, bio);
		mutex_lock(&dmz->chunk_lock);
		dmz_put_chunk_work(cw);
	}

	/* Queueing the work incremented the work refcount */
	dmz_put_chunk_work(cw);

	mutex_unlock(&dmz->chunk_lock);
}

/*
 * Flush work.
 */
static void dmz_flush_work(struct work_struct *work)
{
	struct dmz_target *dmz = container_of(work, struct dmz_target, flush_work.work);
	struct bio *bio;
	int ret;

	/* Flush dirty metadata blocks */
	ret = dmz_flush_metadata(dmz->metadata);
	if (ret)
		DMDEBUG("(%s): Metadata flush failed, rc=%d",
			dmz_metadata_label(dmz->metadata), ret);

	/* Process queued flush requests */
	while (1) {
		spin_lock(&dmz->flush_lock);
		bio = bio_list_pop(&dmz->flush_list);
		spin_unlock(&dmz->flush_lock);

		if (!bio)
			break;

		dmz_bio_endio(bio, errno_to_blk_status(ret));
	}

	queue_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);
}

/*
 * Get a chunk work and start it to process a new BIO.
 * If the BIO chunk has no work yet, create one.
 */
static int dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
{
	unsigned int chunk = dmz_bio_chunk(dmz->metadata, bio);
	struct dm_chunk_work *cw;
	int ret = 0;

	mutex_lock(&dmz->chunk_lock);

	/* Get the BIO chunk work. If one is not active yet, create one */
	cw = radix_tree_lookup(&dmz->chunk_rxtree, chunk);
	if (cw) {
		dmz_get_chunk_work(cw);
	} else {
		/* Create a new chunk work */
		cw = kmalloc(sizeof(struct dm_chunk_work), GFP_NOIO);
		if (unlikely(!cw)) {
			ret = -ENOMEM;
			goto out;
		}

		INIT_WORK(&cw->work, dmz_chunk_work);
		refcount_set(&cw->refcount, 1);
		cw->target = dmz;
		cw->chunk = chunk;
		bio_list_init(&cw->bio_list);

		ret = radix_tree_insert(&dmz->chunk_rxtree, chunk, cw);
		if (unlikely(ret)) {
			kfree(cw);
			goto out;
		}
	}

	bio_list_add(&cw->bio_list, bio);

	if (queue_work(dmz->chunk_wq, &cw->work))
		dmz_get_chunk_work(cw);
out:
	mutex_unlock(&dmz->chunk_lock);
	return ret;
}

/*
 * Check if the backing device is being removed. If it's on the way out,
 * start failing I/O. Reclaim and metadata components also call this
 * function to cleanly abort operation in the event of such failure.
 */
bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev)
{
	if (dmz_dev->flags & DMZ_BDEV_DYING)
		return true;

	if (dmz_dev->flags & DMZ_CHECK_BDEV)
		return !dmz_check_bdev(dmz_dev);

	if (blk_queue_dying(bdev_get_queue(dmz_dev->bdev))) {
		dmz_dev_warn(dmz_dev, "Backing device queue dying");
		dmz_dev->flags |= DMZ_BDEV_DYING;
	}

	return dmz_dev->flags & DMZ_BDEV_DYING;
}

/*
 * Check the backing device availability. This detects such events as
 * backing device going offline due to errors, media removals, etc.
 * This check is less efficient than dmz_bdev_is_dying() and should
 * only be performed as a part of error handling.
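 * If the disk reports a media change event, the device is marked as dying
 * and further I/O to it will be failed.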
 */
bool dmz_check_bdev(struct dmz_dev *dmz_dev)
{
	struct gendisk *disk;

	dmz_dev->flags &= ~DMZ_CHECK_BDEV;

	if (dmz_bdev_is_dying(dmz_dev))
		return false;

	disk = dmz_dev->bdev->bd_disk;
	if (disk->fops->check_events &&
	    disk->fops->check_events(disk, 0) & DISK_EVENT_MEDIA_CHANGE) {
		dmz_dev_warn(dmz_dev, "Backing device offline");
		dmz_dev->flags |= DMZ_BDEV_DYING;
	}

	return !(dmz_dev->flags & DMZ_BDEV_DYING);
}

/*
 * Process a new BIO.
 */
static int dmz_map(struct dm_target *ti, struct bio *bio)
{
	struct dmz_target *dmz = ti->private;
	struct dmz_metadata *zmd = dmz->metadata;
	struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
	sector_t sector = bio->bi_iter.bi_sector;
	unsigned int nr_sectors = bio_sectors(bio);
	sector_t chunk_sector;
	int ret;

	if (dmz_dev_is_dying(zmd))
		return DM_MAPIO_KILL;

	DMDEBUG("(%s): BIO op %d sector %llu + %u => chunk %llu, block %llu, %u blocks",
		dmz_metadata_label(zmd),
		bio_op(bio), (unsigned long long)sector, nr_sectors,
		(unsigned long long)dmz_bio_chunk(zmd, bio),
		(unsigned long long)dmz_chunk_block(zmd, dmz_bio_block(bio)),
		(unsigned int)dmz_bio_blocks(bio));

	if (!nr_sectors && bio_op(bio) != REQ_OP_WRITE)
		return DM_MAPIO_REMAPPED;

	/* The BIO should be block aligned */
	if ((nr_sectors & DMZ_BLOCK_SECTORS_MASK) || (sector & DMZ_BLOCK_SECTORS_MASK))
		return DM_MAPIO_KILL;

	/* Initialize the BIO context */
	bioctx->dev = NULL;
	bioctx->zone = NULL;
	bioctx->bio = bio;
	refcount_set(&bioctx->ref, 1);

	/* Set the BIO pending in the flush list */
	if (!nr_sectors && bio_op(bio) == REQ_OP_WRITE) {
		spin_lock(&dmz->flush_lock);
		bio_list_add(&dmz->flush_list, bio);
		spin_unlock(&dmz->flush_lock);
		mod_delayed_work(dmz->flush_wq, &dmz->flush_work, 0);
		return DM_MAPIO_SUBMITTED;
	}

	/* Split zone BIOs to fit entirely into a zone */
	chunk_sector = sector & (dmz_zone_nr_sectors(zmd) - 1);
	if (chunk_sector + nr_sectors > dmz_zone_nr_sectors(zmd))
		dm_accept_partial_bio(bio, dmz_zone_nr_sectors(zmd) - chunk_sector);

	/* Now ready to handle this BIO */
	ret = dmz_queue_chunk_work(dmz, bio);
	if (ret) {
		DMDEBUG("(%s): BIO op %d, can't process chunk %llu, err %i",
			dmz_metadata_label(zmd),
			bio_op(bio), (u64)dmz_bio_chunk(zmd, bio),
			ret);
		return DM_MAPIO_REQUEUE;
	}

	return DM_MAPIO_SUBMITTED;
}

/*
 * Get zoned device information.
 */
static int dmz_get_zoned_device(struct dm_target *ti, char *path,
				int idx, int nr_devs)
{
	struct dmz_target *dmz = ti->private;
	struct dm_dev *ddev;
	struct dmz_dev *dev;
	int ret;
	struct block_device *bdev;

	/* Get the target device */
	ret = dm_get_device(ti, path, dm_table_get_mode(ti->table), &ddev);
	if (ret) {
		ti->error = "Get target device failed";
		return ret;
	}

	bdev = ddev->bdev;
	if (bdev_zoned_model(bdev) == BLK_ZONED_NONE) {
		if (nr_devs == 1) {
			ti->error = "Invalid regular device";
			goto err;
		}
		if (idx != 0) {
			ti->error = "First device must be a regular device";
			goto err;
		}
		if (dmz->ddev[0]) {
			ti->error = "Too many regular devices";
			goto err;
		}
		dev = &dmz->dev[idx];
		dev->flags = DMZ_BDEV_REGULAR;
	} else {
		if (dmz->ddev[idx]) {
			ti->error = "Too many zoned devices";
			goto err;
		}
		if (nr_devs > 1 && idx == 0) {
			ti->error = "First device must be a regular device";
			goto err;
		}
		dev = &dmz->dev[idx];
	}
	dev->bdev = bdev;
	dev->dev_idx = idx;
	(void)bdevname(dev->bdev, dev->name);

	dev->capacity = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
	if (ti->begin) {
		ti->error = "Partial mapping is not supported";
		goto err;
	}

	dmz->ddev[idx] = ddev;

	return 0;
err:
	dm_put_device(ti, ddev);
	return -EINVAL;
}

/*
 * Cleanup zoned device information.
 */
static void dmz_put_zoned_device(struct dm_target *ti)
{
	struct dmz_target *dmz = ti->private;
	int i;

	for (i = 0; i < dmz->nr_ddevs; i++) {
		if (dmz->ddev[i]) {
			dm_put_device(ti, dmz->ddev[i]);
			dmz->ddev[i] = NULL;
		}
	}
}

static int dmz_fixup_devices(struct dm_target *ti)
{
	struct dmz_target *dmz = ti->private;
	struct dmz_dev *reg_dev, *zoned_dev;
	struct request_queue *q;
	sector_t zone_nr_sectors = 0;
	int i;

	/*
	 * When we have more than one device, the first one must be a
	 * regular block device and the others zoned block devices.
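	 * All zoned devices must also report the same zone size.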
	 */
	if (dmz->nr_ddevs > 1) {
		reg_dev = &dmz->dev[0];
		if (!(reg_dev->flags & DMZ_BDEV_REGULAR)) {
			ti->error = "Primary disk is not a regular device";
			return -EINVAL;
		}
		for (i = 1; i < dmz->nr_ddevs; i++) {
			zoned_dev = &dmz->dev[i];
			if (zoned_dev->flags & DMZ_BDEV_REGULAR) {
				ti->error = "Secondary disk is not a zoned device";
				return -EINVAL;
			}
			q = bdev_get_queue(zoned_dev->bdev);
			if (zone_nr_sectors &&
			    zone_nr_sectors != blk_queue_zone_sectors(q)) {
				ti->error = "Zone nr sectors mismatch";
				return -EINVAL;
			}
			zone_nr_sectors = blk_queue_zone_sectors(q);
			zoned_dev->zone_nr_sectors = zone_nr_sectors;
			zoned_dev->nr_zones =
				blkdev_nr_zones(zoned_dev->bdev->bd_disk);
		}
	} else {
		reg_dev = NULL;
		zoned_dev = &dmz->dev[0];
		if (zoned_dev->flags & DMZ_BDEV_REGULAR) {
			ti->error = "Disk is not a zoned device";
			return -EINVAL;
		}
		q = bdev_get_queue(zoned_dev->bdev);
		zoned_dev->zone_nr_sectors = blk_queue_zone_sectors(q);
		zoned_dev->nr_zones = blkdev_nr_zones(zoned_dev->bdev->bd_disk);
	}

	if (reg_dev) {
		sector_t zone_offset;

		reg_dev->zone_nr_sectors = zone_nr_sectors;
		reg_dev->nr_zones =
			DIV_ROUND_UP_SECTOR_T(reg_dev->capacity,
					      reg_dev->zone_nr_sectors);
		reg_dev->zone_offset = 0;
		zone_offset = reg_dev->nr_zones;
		for (i = 1; i < dmz->nr_ddevs; i++) {
			dmz->dev[i].zone_offset = zone_offset;
			zone_offset += dmz->dev[i].nr_zones;
		}
	}
	return 0;
}

/*
 * Setup target.
 */
static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct dmz_target *dmz;
	int ret, i;

	/* Check arguments */
	if (argc < 1) {
		ti->error = "Invalid argument count";
		return -EINVAL;
	}

	/* Allocate and initialize the target descriptor */
	dmz = kzalloc(sizeof(struct dmz_target), GFP_KERNEL);
	if (!dmz) {
		ti->error = "Unable to allocate the zoned target descriptor";
		return -ENOMEM;
	}
	dmz->dev = kcalloc(argc, sizeof(struct dmz_dev), GFP_KERNEL);
	if (!dmz->dev) {
		ti->error = "Unable to allocate the zoned device descriptors";
		kfree(dmz);
		return -ENOMEM;
	}
	dmz->ddev = kcalloc(argc, sizeof(struct dm_dev *), GFP_KERNEL);
	if (!dmz->ddev) {
		ti->error = "Unable to allocate the dm device descriptors";
		ret = -ENOMEM;
		goto err;
	}
	dmz->nr_ddevs = argc;

	ti->private = dmz;

	/* Get the target zoned block device */
	for (i = 0; i < argc; i++) {
		ret = dmz_get_zoned_device(ti, argv[i], i, argc);
		if (ret)
			goto err_dev;
	}
	ret = dmz_fixup_devices(ti);
	if (ret)
		goto err_dev;

	/* Initialize metadata */
	ret = dmz_ctr_metadata(dmz->dev, argc, &dmz->metadata,
			       dm_table_device_name(ti->table));
	if (ret) {
		ti->error = "Metadata initialization failed";
		goto err_dev;
	}

	/* Set target (no write same support) */
	ti->max_io_len = dmz_zone_nr_sectors(dmz->metadata);
	ti->num_flush_bios = 1;
	ti->num_discard_bios = 1;
	ti->num_write_zeroes_bios = 1;
	ti->per_io_data_size = sizeof(struct dmz_bioctx);
	ti->flush_supported = true;
	ti->discards_supported = true;

	/* The exposed capacity is the number of chunks that can be mapped */
	ti->len = (sector_t)dmz_nr_chunks(dmz->metadata) <<
		dmz_zone_nr_sectors_shift(dmz->metadata);

	/* Zone BIO */
	ret = bioset_init(&dmz->bio_set, DMZ_MIN_BIOS, 0, 0);
	if (ret) {
		ti->error = "Create BIO set failed";
		goto err_meta;
	}

	/* Chunk BIO work */
	mutex_init(&dmz->chunk_lock);
	INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_NOIO);
	dmz->chunk_wq = alloc_workqueue("dmz_cwq_%s",
					WQ_MEM_RECLAIM | WQ_UNBOUND, 0,
					dmz_metadata_label(dmz->metadata));
	if (!dmz->chunk_wq) {
		ti->error = "Create chunk workqueue failed";
		ret = -ENOMEM;
		goto err_bio;
	}

	/* Flush work */
	spin_lock_init(&dmz->flush_lock);
	bio_list_init(&dmz->flush_list);
	INIT_DELAYED_WORK(&dmz->flush_work, dmz_flush_work);
	dmz->flush_wq = alloc_ordered_workqueue("dmz_fwq_%s", WQ_MEM_RECLAIM,
						dmz_metadata_label(dmz->metadata));
	if (!dmz->flush_wq) {
		ti->error = "Create flush workqueue failed";
		ret = -ENOMEM;
		goto err_cwq;
	}
	mod_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);

	/* Initialize reclaim */
	for (i = 0; i < dmz->nr_ddevs; i++) {
		ret = dmz_ctr_reclaim(dmz->metadata, &dmz->dev[i].reclaim, i);
		if (ret) {
			ti->error = "Zone reclaim initialization failed";
			goto err_fwq;
		}
	}

	DMINFO("(%s): Target device: %llu 512-byte logical sectors (%llu blocks)",
	       dmz_metadata_label(dmz->metadata),
	       (unsigned long long)ti->len,
	       (unsigned long long)dmz_sect2blk(ti->len));

	return 0;
err_fwq:
	destroy_workqueue(dmz->flush_wq);
err_cwq:
	destroy_workqueue(dmz->chunk_wq);
err_bio:
	mutex_destroy(&dmz->chunk_lock);
	bioset_exit(&dmz->bio_set);
err_meta:
	dmz_dtr_metadata(dmz->metadata);
err_dev:
	dmz_put_zoned_device(ti);
err:
	kfree(dmz->dev);
	kfree(dmz);

	return ret;
}

/*
 * Cleanup target.
 */
static void dmz_dtr(struct dm_target *ti)
{
	struct dmz_target *dmz = ti->private;
	int i;

	flush_workqueue(dmz->chunk_wq);
	destroy_workqueue(dmz->chunk_wq);

	for (i = 0; i < dmz->nr_ddevs; i++)
		dmz_dtr_reclaim(dmz->dev[i].reclaim);

	cancel_delayed_work_sync(&dmz->flush_work);
	destroy_workqueue(dmz->flush_wq);

	(void) dmz_flush_metadata(dmz->metadata);

	dmz_dtr_metadata(dmz->metadata);

	bioset_exit(&dmz->bio_set);

	dmz_put_zoned_device(ti);

	mutex_destroy(&dmz->chunk_lock);

	kfree(dmz->dev);
	kfree(dmz);
}

/*
 * Setup target request queue limits.
 */
static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct dmz_target *dmz = ti->private;
	unsigned int chunk_sectors = dmz_zone_nr_sectors(dmz->metadata);

	limits->logical_block_size = DMZ_BLOCK_SIZE;
	limits->physical_block_size = DMZ_BLOCK_SIZE;

	blk_limits_io_min(limits, DMZ_BLOCK_SIZE);
	blk_limits_io_opt(limits, DMZ_BLOCK_SIZE);

	limits->discard_alignment = DMZ_BLOCK_SIZE;
	limits->discard_granularity = DMZ_BLOCK_SIZE;
	limits->max_discard_sectors = chunk_sectors;
	limits->max_hw_discard_sectors = chunk_sectors;
	limits->max_write_zeroes_sectors = chunk_sectors;

	/* FS hint to try to align to the device zone size */
	limits->chunk_sectors = chunk_sectors;
	limits->max_sectors = chunk_sectors;

	/* We are exposing a drive-managed zoned block device */
	limits->zoned = BLK_ZONED_NONE;
}

/*
 * Pass on ioctl to the backend device.
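 * Note that only the first configured device (dmz->dev[0]) is used here.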
 */
static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
{
	struct dmz_target *dmz = ti->private;
	struct dmz_dev *dev = &dmz->dev[0];

	if (!dmz_check_bdev(dev))
		return -EIO;

	*bdev = dev->bdev;

	return 0;
}

/*
 * Stop works on suspend.
 */
static void dmz_suspend(struct dm_target *ti)
{
	struct dmz_target *dmz = ti->private;
	int i;

	flush_workqueue(dmz->chunk_wq);
	for (i = 0; i < dmz->nr_ddevs; i++)
		dmz_suspend_reclaim(dmz->dev[i].reclaim);
	cancel_delayed_work_sync(&dmz->flush_work);
}

/*
 * Restart works on resume or if suspend failed.
 */
static void dmz_resume(struct dm_target *ti)
{
	struct dmz_target *dmz = ti->private;
	int i;

	queue_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);
	for (i = 0; i < dmz->nr_ddevs; i++)
		dmz_resume_reclaim(dmz->dev[i].reclaim);
}

static int dmz_iterate_devices(struct dm_target *ti,
			       iterate_devices_callout_fn fn, void *data)
{
	struct dmz_target *dmz = ti->private;
	unsigned int zone_nr_sectors = dmz_zone_nr_sectors(dmz->metadata);
	sector_t capacity;
	int i, r;

	for (i = 0; i < dmz->nr_ddevs; i++) {
		capacity = dmz->dev[i].capacity & ~(zone_nr_sectors - 1);
		r = fn(ti, dmz->ddev[i], 0, capacity, data);
		if (r)
			break;
	}
	return r;
}

static void dmz_status(struct dm_target *ti, status_type_t type,
		       unsigned int status_flags, char *result,
		       unsigned int maxlen)
{
	struct dmz_target *dmz = ti->private;
	ssize_t sz = 0;
	char buf[BDEVNAME_SIZE];
	struct dmz_dev *dev;
	int i;

	switch (type) {
	case STATUSTYPE_INFO:
		DMEMIT("%u zones %u/%u cache",
		       dmz_nr_zones(dmz->metadata),
		       dmz_nr_unmap_cache_zones(dmz->metadata),
		       dmz_nr_cache_zones(dmz->metadata));
		for (i = 0; i < dmz->nr_ddevs; i++) {
			/*
			 * For a multi-device setup the first device
			 * contains only cache zones.
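			 * Its random/sequential zone counters are
			 * therefore not reported.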
			 */
			if ((i == 0) &&
			    (dmz_nr_cache_zones(dmz->metadata) > 0))
				continue;
			DMEMIT(" %u/%u random %u/%u sequential",
			       dmz_nr_unmap_rnd_zones(dmz->metadata, i),
			       dmz_nr_rnd_zones(dmz->metadata, i),
			       dmz_nr_unmap_seq_zones(dmz->metadata, i),
			       dmz_nr_seq_zones(dmz->metadata, i));
		}
		break;
	case STATUSTYPE_TABLE:
		dev = &dmz->dev[0];
		format_dev_t(buf, dev->bdev->bd_dev);
		DMEMIT("%s", buf);
		for (i = 1; i < dmz->nr_ddevs; i++) {
			dev = &dmz->dev[i];
			format_dev_t(buf, dev->bdev->bd_dev);
			DMEMIT(" %s", buf);
		}
		break;
	case STATUSTYPE_IMA:
		*result = '\0';
		break;
	}
	return;
}

static int dmz_message(struct dm_target *ti, unsigned int argc, char **argv,
		       char *result, unsigned int maxlen)
{
	struct dmz_target *dmz = ti->private;
	int r = -EINVAL;

	if (!strcasecmp(argv[0], "reclaim")) {
		int i;

		for (i = 0; i < dmz->nr_ddevs; i++)
			dmz_schedule_reclaim(dmz->dev[i].reclaim);
		r = 0;
	} else
		DMERR("unrecognized message %s", argv[0]);
	return r;
}

static struct target_type dmz_type = {
	.name		 = "zoned",
	.version	 = {2, 0, 0},
	.features	 = DM_TARGET_SINGLETON | DM_TARGET_MIXED_ZONED_MODEL,
	.module		 = THIS_MODULE,
	.ctr		 = dmz_ctr,
	.dtr		 = dmz_dtr,
	.map		 = dmz_map,
	.io_hints	 = dmz_io_hints,
	.prepare_ioctl	 = dmz_prepare_ioctl,
	.postsuspend	 = dmz_suspend,
	.resume		 = dmz_resume,
	.iterate_devices = dmz_iterate_devices,
	.status		 = dmz_status,
	.message	 = dmz_message,
};

static int __init dmz_init(void)
{
	return dm_register_target(&dmz_type);
}

static void __exit dmz_exit(void)
{
	dm_unregister_target(&dmz_type);
}

module_init(dmz_init);
module_exit(dmz_exit);

MODULE_DESCRIPTION(DM_NAME " target for zoned block devices");
MODULE_AUTHOR("Damien Le Moal <damien.lemoal@wdc.com>");
MODULE_LICENSE("GPL");