// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2021 Western Digital Corporation or its affiliates.
 */

#include <linux/blkdev.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/bitmap.h>

#include "dm-core.h"

#define DM_MSG_PREFIX		"zone"

#define DM_ZONE_INVALID_WP_OFST		UINT_MAX

/*
 * For internal zone reports bypassing the top BIO submission path.
 */
static int dm_blk_do_report_zones(struct mapped_device *md, struct dm_table *t,
				  sector_t sector, unsigned int nr_zones,
				  report_zones_cb cb, void *data)
{
	struct gendisk *disk = md->disk;
	int ret;
	struct dm_report_zones_args args = {
		.next_sector = sector,
		.orig_data = data,
		.orig_cb = cb,
	};

	do {
		struct dm_target *tgt;

		tgt = dm_table_find_target(t, args.next_sector);
		if (WARN_ON_ONCE(!tgt->type->report_zones))
			return -EIO;

		args.tgt = tgt;
		ret = tgt->type->report_zones(tgt, &args,
					      nr_zones - args.zone_idx);
		if (ret < 0)
			return ret;
	} while (args.zone_idx < nr_zones &&
		 args.next_sector < get_capacity(disk));

	return args.zone_idx;
}

/*
 * User facing dm device block device report zone operation. This calls the
 * report_zones operation for each target of a device table. This operation is
 * generally implemented by targets using dm_report_zones().
 */
int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
			unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct mapped_device *md = disk->private_data;
	struct dm_table *map;
	int srcu_idx, ret;

	if (dm_suspended_md(md))
		return -EAGAIN;

	map = dm_get_live_table(md, &srcu_idx);
	if (!map)
		return -EIO;

	ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb, data);

	dm_put_live_table(md, srcu_idx);

	return ret;
}

static int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx,
			      void *data)
{
	struct dm_report_zones_args *args = data;
	sector_t sector_diff = args->tgt->begin - args->start;

	/*
	 * Ignore zones beyond the target range.
	 */
	if (zone->start >= args->start + args->tgt->len)
		return 0;

	/*
	 * Remap the start sector and write pointer position of the zone
	 * to match its position in the target range.
	 */
	zone->start += sector_diff;
	if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
		if (zone->cond == BLK_ZONE_COND_FULL)
			zone->wp = zone->start + zone->len;
		else if (zone->cond == BLK_ZONE_COND_EMPTY)
			zone->wp = zone->start;
		else
			zone->wp += sector_diff;
	}

	args->next_sector = zone->start + zone->len;
	return args->orig_cb(zone, args->zone_idx++, args->orig_data);
}

/*
 * Helper for drivers of zoned targets to implement struct target_type
 * report_zones operation.
 */
int dm_report_zones(struct block_device *bdev, sector_t start, sector_t sector,
		    struct dm_report_zones_args *args, unsigned int nr_zones)
{
	/*
	 * Set the target mapping start sector first so that
	 * dm_report_zones_cb() can correctly remap zone information.
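	 * The zone report itself is obtained from the underlying device with
	 * blkdev_report_zones(), which invokes the callback for each zone of
	 * the requested sector range.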
	 */
	args->start = start;

	return blkdev_report_zones(bdev, sector, nr_zones,
				   dm_report_zones_cb, args);
}
EXPORT_SYMBOL_GPL(dm_report_zones);

bool dm_is_zone_write(struct mapped_device *md, struct bio *bio)
{
	struct request_queue *q = md->queue;

	if (!blk_queue_is_zoned(q))
		return false;

	switch (bio_op(bio)) {
	case REQ_OP_WRITE_ZEROES:
	case REQ_OP_WRITE:
		return !op_is_flush(bio->bi_opf) && bio_sectors(bio);
	default:
		return false;
	}
}

void dm_cleanup_zoned_dev(struct mapped_device *md)
{
	if (md->disk) {
		bitmap_free(md->disk->conv_zones_bitmap);
		md->disk->conv_zones_bitmap = NULL;
		bitmap_free(md->disk->seq_zones_wlock);
		md->disk->seq_zones_wlock = NULL;
	}

	kvfree(md->zwp_offset);
	md->zwp_offset = NULL;
	md->nr_zones = 0;
}

static unsigned int dm_get_zone_wp_offset(struct blk_zone *zone)
{
	switch (zone->cond) {
	case BLK_ZONE_COND_IMP_OPEN:
	case BLK_ZONE_COND_EXP_OPEN:
	case BLK_ZONE_COND_CLOSED:
		return zone->wp - zone->start;
	case BLK_ZONE_COND_FULL:
		return zone->len;
	case BLK_ZONE_COND_EMPTY:
	case BLK_ZONE_COND_NOT_WP:
	case BLK_ZONE_COND_OFFLINE:
	case BLK_ZONE_COND_READONLY:
	default:
		/*
		 * Conventional, offline and read-only zones do not have a valid
		 * write pointer. Use 0 as for an empty zone.
		 */
		return 0;
	}
}

static int dm_zone_revalidate_cb(struct blk_zone *zone, unsigned int idx,
				 void *data)
{
	struct mapped_device *md = data;
	struct gendisk *disk = md->disk;

	switch (zone->type) {
	case BLK_ZONE_TYPE_CONVENTIONAL:
		if (!disk->conv_zones_bitmap) {
			disk->conv_zones_bitmap = bitmap_zalloc(disk->nr_zones,
								GFP_NOIO);
			if (!disk->conv_zones_bitmap)
				return -ENOMEM;
		}
		set_bit(idx, disk->conv_zones_bitmap);
		break;
	case BLK_ZONE_TYPE_SEQWRITE_REQ:
	case BLK_ZONE_TYPE_SEQWRITE_PREF:
		if (!disk->seq_zones_wlock) {
			disk->seq_zones_wlock = bitmap_zalloc(disk->nr_zones,
							      GFP_NOIO);
			if (!disk->seq_zones_wlock)
				return -ENOMEM;
		}
		if (!md->zwp_offset) {
			md->zwp_offset =
				kvcalloc(disk->nr_zones, sizeof(unsigned int),
					 GFP_KERNEL);
			if (!md->zwp_offset)
				return -ENOMEM;
		}
		md->zwp_offset[idx] = dm_get_zone_wp_offset(zone);

		break;
	default:
		DMERR("Invalid zone type 0x%x at sectors %llu",
		      (int)zone->type, zone->start);
		return -ENODEV;
	}

	return 0;
}

/*
 * Revalidate the zones of a mapped device to initialize the resources
 * necessary for zone append emulation. Note that we cannot simply use the
 * block layer blk_revalidate_disk_zones() function here as the mapped device
 * is suspended (this is called from __bind() context).
 */
static int dm_revalidate_zones(struct mapped_device *md, struct dm_table *t)
{
	struct gendisk *disk = md->disk;
	unsigned int noio_flag;
	int ret;

	/*
	 * Check if something changed. If yes, clean up the current resources
	 * and reallocate everything.
	 */
	if (!disk->nr_zones || disk->nr_zones != md->nr_zones)
		dm_cleanup_zoned_dev(md);
	if (md->nr_zones)
		return 0;

	/*
	 * Scan all zones to initialize everything. Ensure that all vmalloc
	 * operations in this context are done as if GFP_NOIO was specified.
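	 * This prevents memory reclaim from issuing IO to the suspended
	 * device while its zones are being scanned.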
	 */
	noio_flag = memalloc_noio_save();
	ret = dm_blk_do_report_zones(md, t, 0, disk->nr_zones,
				     dm_zone_revalidate_cb, md);
	memalloc_noio_restore(noio_flag);
	if (ret < 0)
		goto err;
	if (ret != disk->nr_zones) {
		ret = -EIO;
		goto err;
	}

	md->nr_zones = disk->nr_zones;

	return 0;

err:
	DMERR("Revalidate zones failed %d", ret);
	dm_cleanup_zoned_dev(md);
	return ret;
}

static int device_not_zone_append_capable(struct dm_target *ti,
					  struct dm_dev *dev, sector_t start,
					  sector_t len, void *data)
{
	return !bdev_is_zoned(dev->bdev);
}

static bool dm_table_supports_zone_append(struct dm_table *t)
{
	for (unsigned int i = 0; i < t->num_targets; i++) {
		struct dm_target *ti = dm_table_get_target(t, i);

		if (ti->emulate_zone_append)
			return false;

		if (!ti->type->iterate_devices ||
		    ti->type->iterate_devices(ti, device_not_zone_append_capable, NULL))
			return false;
	}

	return true;
}

int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q)
{
	struct mapped_device *md = t->md;

	/*
	 * For a zoned target, the number of zones should be updated for the
	 * correct value to be exposed in sysfs queue/nr_zones.
	 */
	WARN_ON_ONCE(queue_is_mq(q));
	md->disk->nr_zones = bdev_nr_zones(md->disk->part0);

	/* Check if zone append is natively supported */
	if (dm_table_supports_zone_append(t)) {
		clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
		dm_cleanup_zoned_dev(md);
		return 0;
	}

	/*
	 * Mark the mapped device as needing zone append emulation and
	 * initialize the emulation resources once the capacity is set.
	 */
	set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
	if (!get_capacity(md->disk))
		return 0;

	return dm_revalidate_zones(md, t);
}

static int dm_update_zone_wp_offset_cb(struct blk_zone *zone, unsigned int idx,
				       void *data)
{
	unsigned int *wp_offset = data;

	*wp_offset = dm_get_zone_wp_offset(zone);

	return 0;
}

static int dm_update_zone_wp_offset(struct mapped_device *md, unsigned int zno,
				    unsigned int *wp_ofst)
{
	sector_t sector = zno * bdev_zone_sectors(md->disk->part0);
	unsigned int noio_flag;
	struct dm_table *t;
	int srcu_idx, ret;

	t = dm_get_live_table(md, &srcu_idx);
	if (!t)
		return -EIO;

	/*
	 * Ensure that all memory allocations in this context are done as if
	 * GFP_NOIO was specified.
	 */
	noio_flag = memalloc_noio_save();
	ret = dm_blk_do_report_zones(md, t, sector, 1,
				     dm_update_zone_wp_offset_cb, wp_ofst);
	memalloc_noio_restore(noio_flag);

	dm_put_live_table(md, srcu_idx);

	if (ret != 1)
		return -EIO;

	return 0;
}

struct orig_bio_details {
	enum req_op op;
	unsigned int nr_sectors;
};

/*
 * First phase of BIO mapping for targets with zone append emulation:
 * check all BIOs that change a zone write pointer and change zone
 * append operations into regular write operations.
 */
static bool dm_zone_map_bio_begin(struct mapped_device *md,
				  unsigned int zno, struct bio *clone)
{
	sector_t zsectors = bdev_zone_sectors(md->disk->part0);
	unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]);

	/*
	 * If the target zone is in an error state, recover by inspecting the
	 * zone to get its current write pointer position.
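	 * The position is refreshed with a single zone report issued through
	 * the live device table (see dm_update_zone_wp_offset()).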
	 * Note that since the target zone is already locked, a BIO issuing
	 * context should never see the zone write in the
	 * DM_ZONE_UPDATING_WP_OFST state.
	 */
	if (zwp_offset == DM_ZONE_INVALID_WP_OFST) {
		if (dm_update_zone_wp_offset(md, zno, &zwp_offset))
			return false;
		WRITE_ONCE(md->zwp_offset[zno], zwp_offset);
	}

	switch (bio_op(clone)) {
	case REQ_OP_ZONE_RESET:
	case REQ_OP_ZONE_FINISH:
		return true;
	case REQ_OP_WRITE_ZEROES:
	case REQ_OP_WRITE:
		/* Writes must be aligned to the zone write pointer */
		if ((clone->bi_iter.bi_sector & (zsectors - 1)) != zwp_offset)
			return false;
		break;
	case REQ_OP_ZONE_APPEND:
		/*
		 * Change zone append operations into non-mergeable regular
		 * writes directed at the current write pointer position of the
		 * target zone.
		 */
		clone->bi_opf = REQ_OP_WRITE | REQ_NOMERGE |
			(clone->bi_opf & (~REQ_OP_MASK));
		clone->bi_iter.bi_sector += zwp_offset;
		break;
	default:
		DMWARN_LIMIT("Invalid BIO operation");
		return false;
	}

	/* Cannot write to a full zone */
	if (zwp_offset >= zsectors)
		return false;

	return true;
}

/*
 * Second phase of BIO mapping for targets with zone append emulation:
 * update the zone write pointer offset array to account for the additional
 * data written to a zone. Note that at this point, the remapped clone BIO
 * may already have completed, so we do not touch it.
 */
static blk_status_t dm_zone_map_bio_end(struct mapped_device *md, unsigned int zno,
					struct orig_bio_details *orig_bio_details,
					unsigned int nr_sectors)
{
	unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]);

	/* The clone BIO may already have been completed and failed */
	if (zwp_offset == DM_ZONE_INVALID_WP_OFST)
		return BLK_STS_IOERR;

	/* Update the zone wp offset */
	switch (orig_bio_details->op) {
	case REQ_OP_ZONE_RESET:
		WRITE_ONCE(md->zwp_offset[zno], 0);
		return BLK_STS_OK;
	case REQ_OP_ZONE_FINISH:
		WRITE_ONCE(md->zwp_offset[zno],
			   bdev_zone_sectors(md->disk->part0));
		return BLK_STS_OK;
	case REQ_OP_WRITE_ZEROES:
	case REQ_OP_WRITE:
		WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors);
		return BLK_STS_OK;
	case REQ_OP_ZONE_APPEND:
		/*
		 * Check that the target did not truncate the write operation
		 * emulating a zone append.
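		 * A short write would make the sector reported back to the
		 * zone append issuer inconsistent with the data actually
		 * written to the zone.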
		 */
		if (nr_sectors != orig_bio_details->nr_sectors) {
			DMWARN_LIMIT("Truncated write for zone append");
			return BLK_STS_IOERR;
		}
		WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors);
		return BLK_STS_OK;
	default:
		DMWARN_LIMIT("Invalid BIO operation");
		return BLK_STS_IOERR;
	}
}

static inline void dm_zone_lock(struct gendisk *disk, unsigned int zno,
				struct bio *clone)
{
	if (WARN_ON_ONCE(bio_flagged(clone, BIO_ZONE_WRITE_LOCKED)))
		return;

	wait_on_bit_lock_io(disk->seq_zones_wlock, zno, TASK_UNINTERRUPTIBLE);
	bio_set_flag(clone, BIO_ZONE_WRITE_LOCKED);
}

static inline void dm_zone_unlock(struct gendisk *disk, unsigned int zno,
				  struct bio *clone)
{
	if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED))
		return;

	WARN_ON_ONCE(!test_bit(zno, disk->seq_zones_wlock));
	clear_bit_unlock(zno, disk->seq_zones_wlock);
	smp_mb__after_atomic();
	wake_up_bit(disk->seq_zones_wlock, zno);

	bio_clear_flag(clone, BIO_ZONE_WRITE_LOCKED);
}

static bool dm_need_zone_wp_tracking(struct bio *bio)
{
	/*
	 * Special processing is not needed for operations that do not need the
	 * zone write lock, that is, all operations that target conventional
	 * zones and all operations that do not modify directly a sequential
	 * zone write pointer.
	 */
	if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
		return false;
	switch (bio_op(bio)) {
	case REQ_OP_WRITE_ZEROES:
	case REQ_OP_WRITE:
	case REQ_OP_ZONE_RESET:
	case REQ_OP_ZONE_FINISH:
	case REQ_OP_ZONE_APPEND:
		return bio_zone_is_seq(bio);
	default:
		return false;
	}
}

/*
 * Special IO mapping for targets needing zone append emulation.
 */
int dm_zone_map_bio(struct dm_target_io *tio)
{
	struct dm_io *io = tio->io;
	struct dm_target *ti = tio->ti;
	struct mapped_device *md = io->md;
	struct bio *clone = &tio->clone;
	struct orig_bio_details orig_bio_details;
	unsigned int zno;
	blk_status_t sts;
	int r;

	/*
	 * IOs that do not change a zone write pointer do not need
	 * any additional special processing.
	 */
	if (!dm_need_zone_wp_tracking(clone))
		return ti->type->map(ti, clone);

	/* Lock the target zone */
	zno = bio_zone_no(clone);
	dm_zone_lock(md->disk, zno, clone);

	orig_bio_details.nr_sectors = bio_sectors(clone);
	orig_bio_details.op = bio_op(clone);

	/*
	 * Check that the bio and the target zone write pointer offset are
	 * both valid, and if the bio is a zone append, remap it to a write.
	 */
	if (!dm_zone_map_bio_begin(md, zno, clone)) {
		dm_zone_unlock(md->disk, zno, clone);
		return DM_MAPIO_KILL;
	}

	/* Let the target do its work */
	r = ti->type->map(ti, clone);
	switch (r) {
	case DM_MAPIO_SUBMITTED:
		/*
		 * The target submitted the clone BIO. The target zone will
		 * be unlocked on completion of the clone.
		 */
		sts = dm_zone_map_bio_end(md, zno, &orig_bio_details,
					  *tio->len_ptr);
		break;
	case DM_MAPIO_REMAPPED:
		/*
		 * The target only remapped the clone BIO. In case of error,
		 * unlock the target zone here as the clone will not be
		 * submitted.
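		 * In that case, dm_zone_endio() will never run for the clone
		 * and so will not release the zone write lock.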
		 */
		sts = dm_zone_map_bio_end(md, zno, &orig_bio_details,
					  *tio->len_ptr);
		if (sts != BLK_STS_OK)
			dm_zone_unlock(md->disk, zno, clone);
		break;
	case DM_MAPIO_REQUEUE:
	case DM_MAPIO_KILL:
	default:
		dm_zone_unlock(md->disk, zno, clone);
		sts = BLK_STS_IOERR;
		break;
	}

	if (sts != BLK_STS_OK)
		return DM_MAPIO_KILL;

	return r;
}

/*
 * IO completion callback called from clone_endio().
 */
void dm_zone_endio(struct dm_io *io, struct bio *clone)
{
	struct mapped_device *md = io->md;
	struct gendisk *disk = md->disk;
	struct bio *orig_bio = io->orig_bio;
	unsigned int zwp_offset;
	unsigned int zno;

	/*
	 * For targets that do not emulate zone append, we only need to
	 * handle native zone-append bios.
	 */
	if (!dm_emulate_zone_append(md)) {
		/*
		 * Get the offset within the zone of the written sector
		 * and add that to the original bio sector position.
		 */
		if (clone->bi_status == BLK_STS_OK &&
		    bio_op(clone) == REQ_OP_ZONE_APPEND) {
			sector_t mask =
				(sector_t)bdev_zone_sectors(disk->part0) - 1;

			orig_bio->bi_iter.bi_sector +=
				clone->bi_iter.bi_sector & mask;
		}

		return;
	}

	/*
	 * For targets that do emulate zone append, if the clone BIO does not
	 * own the target zone write lock, we have nothing to do.
	 */
	if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED))
		return;

	zno = bio_zone_no(orig_bio);

	if (clone->bi_status != BLK_STS_OK) {
		/*
		 * BIOs that modify a zone write pointer may leave the zone
		 * in an unknown state in case of failure (e.g. the write
		 * pointer was only partially advanced). In this case, set
		 * the target zone write pointer as invalid unless it is
		 * already being updated.
		 */
		WRITE_ONCE(md->zwp_offset[zno], DM_ZONE_INVALID_WP_OFST);
	} else if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) {
		/*
		 * Get the written sector for zone append operations that were
		 * emulated using regular write operations.
		 */
		zwp_offset = READ_ONCE(md->zwp_offset[zno]);
		if (WARN_ON_ONCE(zwp_offset < bio_sectors(orig_bio)))
			WRITE_ONCE(md->zwp_offset[zno],
				   DM_ZONE_INVALID_WP_OFST);
		else
			orig_bio->bi_iter.bi_sector +=
				zwp_offset - bio_sectors(orig_bio);
	}

	dm_zone_unlock(disk, zno, clone);
}