1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * NVMe ZNS-ZBD command implementation. 4 * Copyright (C) 2021 Western Digital Corporation or its affiliates. 5 */ 6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 7 #include <linux/nvme.h> 8 #include <linux/blkdev.h> 9 #include "nvmet.h" 10 11 /* 12 * We set the Memory Page Size Minimum (MPSMIN) for target controller to 0 13 * which gets added by 12 in the nvme_enable_ctrl() which results in 2^12 = 4k 14 * as page_shift value. When calculating the ZASL use shift by 12. 15 */ 16 #define NVMET_MPSMIN_SHIFT 12 17 18 static inline u8 nvmet_zasl(unsigned int zone_append_sects) 19 { 20 /* 21 * Zone Append Size Limit (zasl) is expressed as a power of 2 value 22 * with the minimum memory page size (i.e. 12) as unit. 23 */ 24 return ilog2(zone_append_sects >> (NVMET_MPSMIN_SHIFT - 9)); 25 } 26 27 static int validate_conv_zones_cb(struct blk_zone *z, 28 unsigned int i, void *data) 29 { 30 if (z->type == BLK_ZONE_TYPE_CONVENTIONAL) 31 return -EOPNOTSUPP; 32 return 0; 33 } 34 35 bool nvmet_bdev_zns_enable(struct nvmet_ns *ns) 36 { 37 u8 zasl = nvmet_zasl(bdev_max_zone_append_sectors(ns->bdev)); 38 struct gendisk *bd_disk = ns->bdev->bd_disk; 39 int ret; 40 41 if (ns->subsys->zasl) { 42 if (ns->subsys->zasl > zasl) 43 return false; 44 } 45 ns->subsys->zasl = zasl; 46 47 /* 48 * Generic zoned block devices may have a smaller last zone which is 49 * not supported by ZNS. Exclude zoned drives that have such smaller 50 * last zone. 51 */ 52 if (get_capacity(bd_disk) & (bdev_zone_sectors(ns->bdev) - 1)) 53 return false; 54 /* 55 * ZNS does not define a conventional zone type. Use report zones 56 * to detect if the device has conventional zones and reject it if 57 * it does. 58 */ 59 ret = blkdev_report_zones(ns->bdev, 0, bdev_nr_zones(ns->bdev), 60 validate_conv_zones_cb, NULL); 61 if (ret < 0) 62 return false; 63 64 ns->blksize_shift = blksize_bits(bdev_logical_block_size(ns->bdev)); 65 66 return true; 67 } 68 69 void nvmet_execute_identify_ctrl_zns(struct nvmet_req *req) 70 { 71 u8 zasl = req->sq->ctrl->subsys->zasl; 72 struct nvmet_ctrl *ctrl = req->sq->ctrl; 73 struct nvme_id_ctrl_zns *id; 74 u16 status; 75 76 id = kzalloc(sizeof(*id), GFP_KERNEL); 77 if (!id) { 78 status = NVME_SC_INTERNAL; 79 goto out; 80 } 81 82 if (ctrl->ops->get_mdts) 83 id->zasl = min_t(u8, ctrl->ops->get_mdts(ctrl), zasl); 84 else 85 id->zasl = zasl; 86 87 status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); 88 89 kfree(id); 90 out: 91 nvmet_req_complete(req, status); 92 } 93 94 void nvmet_execute_identify_ns_zns(struct nvmet_req *req) 95 { 96 struct nvme_id_ns_zns *id_zns = NULL; 97 u64 zsze; 98 u16 status; 99 u32 mar, mor; 100 101 if (le32_to_cpu(req->cmd->identify.nsid) == NVME_NSID_ALL) { 102 req->error_loc = offsetof(struct nvme_identify, nsid); 103 status = NVME_SC_INVALID_NS | NVME_STATUS_DNR; 104 goto out; 105 } 106 107 id_zns = kzalloc(sizeof(*id_zns), GFP_KERNEL); 108 if (!id_zns) { 109 status = NVME_SC_INTERNAL; 110 goto out; 111 } 112 113 status = nvmet_req_find_ns(req); 114 if (status) 115 goto done; 116 117 if (nvmet_ns_revalidate(req->ns)) { 118 mutex_lock(&req->ns->subsys->lock); 119 nvmet_ns_changed(req->ns->subsys, req->ns->nsid); 120 mutex_unlock(&req->ns->subsys->lock); 121 } 122 123 if (!bdev_is_zoned(req->ns->bdev)) { 124 status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; 125 req->error_loc = offsetof(struct nvme_identify, nsid); 126 goto out; 127 } 128 129 zsze = (bdev_zone_sectors(req->ns->bdev) << 9) >> 130 req->ns->blksize_shift; 131 id_zns->lbafe[0].zsze = cpu_to_le64(zsze); 132 133 mor = bdev_max_open_zones(req->ns->bdev); 134 if (!mor) 135 mor = U32_MAX; 136 else 137 mor--; 138 id_zns->mor = cpu_to_le32(mor); 139 140 mar = bdev_max_active_zones(req->ns->bdev); 141 if (!mar) 142 mar = U32_MAX; 143 else 144 mar--; 145 id_zns->mar = cpu_to_le32(mar); 146 147 done: 148 status = nvmet_copy_to_sgl(req, 0, id_zns, sizeof(*id_zns)); 149 out: 150 kfree(id_zns); 151 nvmet_req_complete(req, status); 152 } 153 154 static u16 nvmet_bdev_validate_zone_mgmt_recv(struct nvmet_req *req) 155 { 156 sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->zmr.slba); 157 u32 out_bufsize = (le32_to_cpu(req->cmd->zmr.numd) + 1) << 2; 158 159 if (sect >= get_capacity(req->ns->bdev->bd_disk)) { 160 req->error_loc = offsetof(struct nvme_zone_mgmt_recv_cmd, slba); 161 return NVME_SC_LBA_RANGE | NVME_STATUS_DNR; 162 } 163 164 if (out_bufsize < sizeof(struct nvme_zone_report)) { 165 req->error_loc = offsetof(struct nvme_zone_mgmt_recv_cmd, numd); 166 return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; 167 } 168 169 if (req->cmd->zmr.zra != NVME_ZRA_ZONE_REPORT) { 170 req->error_loc = offsetof(struct nvme_zone_mgmt_recv_cmd, zra); 171 return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; 172 } 173 174 switch (req->cmd->zmr.pr) { 175 case 0: 176 case 1: 177 break; 178 default: 179 req->error_loc = offsetof(struct nvme_zone_mgmt_recv_cmd, pr); 180 return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; 181 } 182 183 switch (req->cmd->zmr.zrasf) { 184 case NVME_ZRASF_ZONE_REPORT_ALL: 185 case NVME_ZRASF_ZONE_STATE_EMPTY: 186 case NVME_ZRASF_ZONE_STATE_IMP_OPEN: 187 case NVME_ZRASF_ZONE_STATE_EXP_OPEN: 188 case NVME_ZRASF_ZONE_STATE_CLOSED: 189 case NVME_ZRASF_ZONE_STATE_FULL: 190 case NVME_ZRASF_ZONE_STATE_READONLY: 191 case NVME_ZRASF_ZONE_STATE_OFFLINE: 192 break; 193 default: 194 req->error_loc = 195 offsetof(struct nvme_zone_mgmt_recv_cmd, zrasf); 196 return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; 197 } 198 199 return NVME_SC_SUCCESS; 200 } 201 202 struct nvmet_report_zone_data { 203 struct nvmet_req *req; 204 u64 out_buf_offset; 205 u64 out_nr_zones; 206 u64 nr_zones; 207 u8 zrasf; 208 }; 209 210 static int nvmet_bdev_report_zone_cb(struct blk_zone *z, unsigned i, void *d) 211 { 212 static const unsigned int nvme_zrasf_to_blk_zcond[] = { 213 [NVME_ZRASF_ZONE_STATE_EMPTY] = BLK_ZONE_COND_EMPTY, 214 [NVME_ZRASF_ZONE_STATE_IMP_OPEN] = BLK_ZONE_COND_IMP_OPEN, 215 [NVME_ZRASF_ZONE_STATE_EXP_OPEN] = BLK_ZONE_COND_EXP_OPEN, 216 [NVME_ZRASF_ZONE_STATE_CLOSED] = BLK_ZONE_COND_CLOSED, 217 [NVME_ZRASF_ZONE_STATE_READONLY] = BLK_ZONE_COND_READONLY, 218 [NVME_ZRASF_ZONE_STATE_FULL] = BLK_ZONE_COND_FULL, 219 [NVME_ZRASF_ZONE_STATE_OFFLINE] = BLK_ZONE_COND_OFFLINE, 220 }; 221 struct nvmet_report_zone_data *rz = d; 222 223 if (rz->zrasf != NVME_ZRASF_ZONE_REPORT_ALL && 224 z->cond != nvme_zrasf_to_blk_zcond[rz->zrasf]) 225 return 0; 226 227 if (rz->nr_zones < rz->out_nr_zones) { 228 struct nvme_zone_descriptor zdesc = { }; 229 u16 status; 230 231 zdesc.zcap = nvmet_sect_to_lba(rz->req->ns, z->capacity); 232 zdesc.zslba = nvmet_sect_to_lba(rz->req->ns, z->start); 233 zdesc.wp = nvmet_sect_to_lba(rz->req->ns, z->wp); 234 zdesc.za = z->reset ? 1 << 2 : 0; 235 zdesc.zs = z->cond << 4; 236 zdesc.zt = z->type; 237 238 status = nvmet_copy_to_sgl(rz->req, rz->out_buf_offset, &zdesc, 239 sizeof(zdesc)); 240 if (status) 241 return -EINVAL; 242 243 rz->out_buf_offset += sizeof(zdesc); 244 } 245 246 rz->nr_zones++; 247 248 return 0; 249 } 250 251 static unsigned long nvmet_req_nr_zones_from_slba(struct nvmet_req *req) 252 { 253 unsigned int sect = nvmet_lba_to_sect(req->ns, req->cmd->zmr.slba); 254 255 return bdev_nr_zones(req->ns->bdev) - bdev_zone_no(req->ns->bdev, sect); 256 } 257 258 static unsigned long get_nr_zones_from_buf(struct nvmet_req *req, u32 bufsize) 259 { 260 if (bufsize <= sizeof(struct nvme_zone_report)) 261 return 0; 262 263 return (bufsize - sizeof(struct nvme_zone_report)) / 264 sizeof(struct nvme_zone_descriptor); 265 } 266 267 static void nvmet_bdev_zone_zmgmt_recv_work(struct work_struct *w) 268 { 269 struct nvmet_req *req = container_of(w, struct nvmet_req, z.zmgmt_work); 270 sector_t start_sect = nvmet_lba_to_sect(req->ns, req->cmd->zmr.slba); 271 unsigned long req_slba_nr_zones = nvmet_req_nr_zones_from_slba(req); 272 u32 out_bufsize = (le32_to_cpu(req->cmd->zmr.numd) + 1) << 2; 273 __le64 nr_zones; 274 u16 status; 275 int ret; 276 struct nvmet_report_zone_data rz_data = { 277 .out_nr_zones = get_nr_zones_from_buf(req, out_bufsize), 278 /* leave the place for report zone header */ 279 .out_buf_offset = sizeof(struct nvme_zone_report), 280 .zrasf = req->cmd->zmr.zrasf, 281 .nr_zones = 0, 282 .req = req, 283 }; 284 285 status = nvmet_bdev_validate_zone_mgmt_recv(req); 286 if (status) 287 goto out; 288 289 if (!req_slba_nr_zones) { 290 status = NVME_SC_SUCCESS; 291 goto out; 292 } 293 294 ret = blkdev_report_zones(req->ns->bdev, start_sect, req_slba_nr_zones, 295 nvmet_bdev_report_zone_cb, &rz_data); 296 if (ret < 0) { 297 status = NVME_SC_INTERNAL; 298 goto out; 299 } 300 301 /* 302 * When partial bit is set nr_zones must indicate the number of zone 303 * descriptors actually transferred. 304 */ 305 if (req->cmd->zmr.pr) 306 rz_data.nr_zones = min(rz_data.nr_zones, rz_data.out_nr_zones); 307 308 nr_zones = cpu_to_le64(rz_data.nr_zones); 309 status = nvmet_copy_to_sgl(req, 0, &nr_zones, sizeof(nr_zones)); 310 311 out: 312 nvmet_req_complete(req, status); 313 } 314 315 void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req) 316 { 317 INIT_WORK(&req->z.zmgmt_work, nvmet_bdev_zone_zmgmt_recv_work); 318 queue_work(zbd_wq, &req->z.zmgmt_work); 319 } 320 321 static inline enum req_op zsa_req_op(u8 zsa) 322 { 323 switch (zsa) { 324 case NVME_ZONE_OPEN: 325 return REQ_OP_ZONE_OPEN; 326 case NVME_ZONE_CLOSE: 327 return REQ_OP_ZONE_CLOSE; 328 case NVME_ZONE_FINISH: 329 return REQ_OP_ZONE_FINISH; 330 case NVME_ZONE_RESET: 331 return REQ_OP_ZONE_RESET; 332 default: 333 return REQ_OP_LAST; 334 } 335 } 336 337 static u16 blkdev_zone_mgmt_errno_to_nvme_status(int ret) 338 { 339 switch (ret) { 340 case 0: 341 return NVME_SC_SUCCESS; 342 case -EINVAL: 343 case -EIO: 344 return NVME_SC_ZONE_INVALID_TRANSITION | NVME_STATUS_DNR; 345 default: 346 return NVME_SC_INTERNAL; 347 } 348 } 349 350 struct nvmet_zone_mgmt_send_all_data { 351 unsigned long *zbitmap; 352 struct nvmet_req *req; 353 }; 354 355 static int zmgmt_send_scan_cb(struct blk_zone *z, unsigned i, void *d) 356 { 357 struct nvmet_zone_mgmt_send_all_data *data = d; 358 359 switch (zsa_req_op(data->req->cmd->zms.zsa)) { 360 case REQ_OP_ZONE_OPEN: 361 switch (z->cond) { 362 case BLK_ZONE_COND_CLOSED: 363 break; 364 default: 365 return 0; 366 } 367 break; 368 case REQ_OP_ZONE_CLOSE: 369 switch (z->cond) { 370 case BLK_ZONE_COND_IMP_OPEN: 371 case BLK_ZONE_COND_EXP_OPEN: 372 break; 373 default: 374 return 0; 375 } 376 break; 377 case REQ_OP_ZONE_FINISH: 378 switch (z->cond) { 379 case BLK_ZONE_COND_IMP_OPEN: 380 case BLK_ZONE_COND_EXP_OPEN: 381 case BLK_ZONE_COND_CLOSED: 382 break; 383 default: 384 return 0; 385 } 386 break; 387 default: 388 return -EINVAL; 389 } 390 391 set_bit(i, data->zbitmap); 392 393 return 0; 394 } 395 396 static u16 nvmet_bdev_zone_mgmt_emulate_all(struct nvmet_req *req) 397 { 398 struct block_device *bdev = req->ns->bdev; 399 unsigned int nr_zones = bdev_nr_zones(bdev); 400 struct bio *bio = NULL; 401 sector_t sector = 0; 402 int ret; 403 struct nvmet_zone_mgmt_send_all_data d = { 404 .req = req, 405 }; 406 407 d.zbitmap = kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(*(d.zbitmap)), 408 GFP_NOIO, bdev->bd_disk->node_id); 409 if (!d.zbitmap) { 410 ret = -ENOMEM; 411 goto out; 412 } 413 414 /* Scan and build bitmap of the eligible zones */ 415 ret = blkdev_report_zones(bdev, 0, nr_zones, zmgmt_send_scan_cb, &d); 416 if (ret != nr_zones) { 417 if (ret > 0) 418 ret = -EIO; 419 goto out; 420 } else { 421 /* We scanned all the zones */ 422 ret = 0; 423 } 424 425 while (sector < bdev_nr_sectors(bdev)) { 426 if (test_bit(disk_zone_no(bdev->bd_disk, sector), d.zbitmap)) { 427 bio = blk_next_bio(bio, bdev, 0, 428 zsa_req_op(req->cmd->zms.zsa) | REQ_SYNC, 429 GFP_KERNEL); 430 bio->bi_iter.bi_sector = sector; 431 /* This may take a while, so be nice to others */ 432 cond_resched(); 433 } 434 sector += bdev_zone_sectors(bdev); 435 } 436 437 if (bio) { 438 ret = submit_bio_wait(bio); 439 bio_put(bio); 440 } 441 442 out: 443 kfree(d.zbitmap); 444 445 return blkdev_zone_mgmt_errno_to_nvme_status(ret); 446 } 447 448 static u16 nvmet_bdev_execute_zmgmt_send_all(struct nvmet_req *req) 449 { 450 int ret; 451 452 switch (zsa_req_op(req->cmd->zms.zsa)) { 453 case REQ_OP_ZONE_RESET: 454 ret = blkdev_zone_mgmt(req->ns->bdev, REQ_OP_ZONE_RESET, 0, 455 get_capacity(req->ns->bdev->bd_disk)); 456 if (ret < 0) 457 return blkdev_zone_mgmt_errno_to_nvme_status(ret); 458 break; 459 case REQ_OP_ZONE_OPEN: 460 case REQ_OP_ZONE_CLOSE: 461 case REQ_OP_ZONE_FINISH: 462 return nvmet_bdev_zone_mgmt_emulate_all(req); 463 default: 464 /* this is needed to quiet compiler warning */ 465 req->error_loc = offsetof(struct nvme_zone_mgmt_send_cmd, zsa); 466 return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; 467 } 468 469 return NVME_SC_SUCCESS; 470 } 471 472 static void nvmet_bdev_zmgmt_send_work(struct work_struct *w) 473 { 474 struct nvmet_req *req = container_of(w, struct nvmet_req, z.zmgmt_work); 475 sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->zms.slba); 476 enum req_op op = zsa_req_op(req->cmd->zms.zsa); 477 struct block_device *bdev = req->ns->bdev; 478 sector_t zone_sectors = bdev_zone_sectors(bdev); 479 u16 status = NVME_SC_SUCCESS; 480 int ret; 481 482 if (op == REQ_OP_LAST) { 483 req->error_loc = offsetof(struct nvme_zone_mgmt_send_cmd, zsa); 484 status = NVME_SC_ZONE_INVALID_TRANSITION | NVME_STATUS_DNR; 485 goto out; 486 } 487 488 /* when select all bit is set slba field is ignored */ 489 if (req->cmd->zms.select_all) { 490 status = nvmet_bdev_execute_zmgmt_send_all(req); 491 goto out; 492 } 493 494 if (sect >= get_capacity(bdev->bd_disk)) { 495 req->error_loc = offsetof(struct nvme_zone_mgmt_send_cmd, slba); 496 status = NVME_SC_LBA_RANGE | NVME_STATUS_DNR; 497 goto out; 498 } 499 500 if (sect & (zone_sectors - 1)) { 501 req->error_loc = offsetof(struct nvme_zone_mgmt_send_cmd, slba); 502 status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; 503 goto out; 504 } 505 506 ret = blkdev_zone_mgmt(bdev, op, sect, zone_sectors); 507 if (ret < 0) 508 status = blkdev_zone_mgmt_errno_to_nvme_status(ret); 509 510 out: 511 nvmet_req_complete(req, status); 512 } 513 514 void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req) 515 { 516 INIT_WORK(&req->z.zmgmt_work, nvmet_bdev_zmgmt_send_work); 517 queue_work(zbd_wq, &req->z.zmgmt_work); 518 } 519 520 static void nvmet_bdev_zone_append_bio_done(struct bio *bio) 521 { 522 struct nvmet_req *req = bio->bi_private; 523 524 if (bio->bi_status == BLK_STS_OK) { 525 req->cqe->result.u64 = 526 nvmet_sect_to_lba(req->ns, bio->bi_iter.bi_sector); 527 } 528 529 nvmet_req_complete(req, blk_to_nvme_status(req, bio->bi_status)); 530 nvmet_req_bio_put(req, bio); 531 } 532 533 void nvmet_bdev_execute_zone_append(struct nvmet_req *req) 534 { 535 sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->rw.slba); 536 const blk_opf_t opf = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE; 537 u16 status = NVME_SC_SUCCESS; 538 unsigned int total_len = 0; 539 struct scatterlist *sg; 540 u32 data_len = nvmet_rw_data_len(req); 541 struct bio *bio; 542 int sg_cnt; 543 544 /* Request is completed on len mismatch in nvmet_check_transter_len() */ 545 if (!nvmet_check_transfer_len(req, nvmet_rw_data_len(req))) 546 return; 547 548 if (data_len > 549 bdev_max_zone_append_sectors(req->ns->bdev) << SECTOR_SHIFT) { 550 req->error_loc = offsetof(struct nvme_rw_command, length); 551 status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; 552 goto out; 553 } 554 555 if (!req->sg_cnt) { 556 nvmet_req_complete(req, 0); 557 return; 558 } 559 560 if (sect >= get_capacity(req->ns->bdev->bd_disk)) { 561 req->error_loc = offsetof(struct nvme_rw_command, slba); 562 status = NVME_SC_LBA_RANGE | NVME_STATUS_DNR; 563 goto out; 564 } 565 566 if (sect & (bdev_zone_sectors(req->ns->bdev) - 1)) { 567 req->error_loc = offsetof(struct nvme_rw_command, slba); 568 status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; 569 goto out; 570 } 571 572 if (nvmet_use_inline_bvec(req)) { 573 bio = &req->z.inline_bio; 574 bio_init(bio, req->ns->bdev, req->inline_bvec, 575 ARRAY_SIZE(req->inline_bvec), opf); 576 } else { 577 bio = bio_alloc(req->ns->bdev, req->sg_cnt, opf, GFP_KERNEL); 578 } 579 580 bio->bi_end_io = nvmet_bdev_zone_append_bio_done; 581 bio->bi_iter.bi_sector = sect; 582 bio->bi_private = req; 583 if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA)) 584 bio->bi_opf |= REQ_FUA; 585 586 for_each_sg(req->sg, sg, req->sg_cnt, sg_cnt) { 587 unsigned int len = sg->length; 588 589 if (bio_add_pc_page(bdev_get_queue(bio->bi_bdev), bio, 590 sg_page(sg), len, sg->offset) != len) { 591 status = NVME_SC_INTERNAL; 592 goto out_put_bio; 593 } 594 total_len += len; 595 } 596 597 if (total_len != data_len) { 598 status = NVME_SC_INTERNAL | NVME_STATUS_DNR; 599 goto out_put_bio; 600 } 601 602 submit_bio(bio); 603 return; 604 605 out_put_bio: 606 nvmet_req_bio_put(req, bio); 607 out: 608 nvmet_req_complete(req, status); 609 } 610 611 u16 nvmet_bdev_zns_parse_io_cmd(struct nvmet_req *req) 612 { 613 struct nvme_command *cmd = req->cmd; 614 615 switch (cmd->common.opcode) { 616 case nvme_cmd_zone_append: 617 req->execute = nvmet_bdev_execute_zone_append; 618 return 0; 619 case nvme_cmd_zone_mgmt_recv: 620 req->execute = nvmet_bdev_execute_zone_mgmt_recv; 621 return 0; 622 case nvme_cmd_zone_mgmt_send: 623 req->execute = nvmet_bdev_execute_zone_mgmt_send; 624 return 0; 625 default: 626 return nvmet_bdev_parse_io_cmd(req); 627 } 628 } 629