/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2023-2024 Chelsio Communications, Inc.
 * Written by: John Baldwin <jhb@FreeBSD.org>
 */

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/disk.h>
#include <sys/fcntl.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memdesc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/sbuf.h>
#include <machine/stdarg.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/host/nvmf_var.h>

struct nvmf_namespace {
	struct nvmf_softc *sc;
	uint64_t size;
	uint32_t id;
	u_int	flags;
	uint32_t lba_size;
	bool	disconnected;
	bool	shutdown;

	TAILQ_HEAD(, bio) pending_bios;
	struct mtx lock;
	volatile u_int active_bios;

	struct cdev *cdev;
};

static void	nvmf_ns_strategy(struct bio *bio);

static void
ns_printf(struct nvmf_namespace *ns, const char *fmt, ...)
{
	char buf[128];
	struct sbuf sb;
	va_list ap;

	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
	sbuf_set_drain(&sb, sbuf_printf_drain, NULL);

	sbuf_printf(&sb, "%sn%u: ", device_get_nameunit(ns->sc->dev),
	    ns->id);

	va_start(ap, fmt);
	sbuf_vprintf(&sb, fmt, ap);
	va_end(ap);

	sbuf_finish(&sb);
	sbuf_delete(&sb);
}

/*
 * The I/O completion may trigger after the received CQE if the I/O
 * used a zero-copy mbuf that isn't harvested until after the NIC
 * driver processes TX completions.  Abuse bio_driver1 as a refcount.
 * Store I/O errors in bio_driver2.
 */
static __inline u_int *
bio_refs(struct bio *bio)
{
	return ((u_int *)&bio->bio_driver1);
}

static void
nvmf_ns_biodone(struct bio *bio)
{
	struct nvmf_namespace *ns;
	int error;

	if (!refcount_release(bio_refs(bio)))
		return;

	ns = bio->bio_dev->si_drv1;

	/* If a request is aborted, resubmit or queue it for resubmission. */
	if (bio->bio_error == ECONNABORTED && !nvmf_fail_disconnect) {
		bio->bio_error = 0;
		bio->bio_driver2 = 0;
		mtx_lock(&ns->lock);
		if (ns->disconnected) {
			if (nvmf_fail_disconnect || ns->shutdown) {
				mtx_unlock(&ns->lock);
				bio->bio_error = ECONNABORTED;
				bio->bio_flags |= BIO_ERROR;
				bio->bio_resid = bio->bio_bcount;
				biodone(bio);
			} else {
				TAILQ_INSERT_TAIL(&ns->pending_bios, bio,
				    bio_queue);
				mtx_unlock(&ns->lock);
			}
		} else {
			mtx_unlock(&ns->lock);
			nvmf_ns_strategy(bio);
		}
	} else {
		/*
		 * I/O errors take precedence over generic EIO from
		 * CQE errors.
		 */
		error = (intptr_t)bio->bio_driver2;
		if (error != 0)
			bio->bio_error = error;
		if (bio->bio_error != 0)
			bio->bio_flags |= BIO_ERROR;
		biodone(bio);
	}

	if (refcount_release(&ns->active_bios))
		wakeup(ns);
}

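/*
 * Completion callbacks.  Read, write, and DSM requests hold two bio
 * references (one released by the data-transfer callback, one by the
 * CQE callback); other commands hold a single reference.  The bio is
 * only completed by nvmf_ns_biodone() once the last reference drops.
 */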
static void
nvmf_ns_io_complete(void *arg, size_t xfered, int error)
{
	struct bio *bio = arg;

	KASSERT(xfered <= bio->bio_bcount,
	    ("%s: xfered > bio_bcount", __func__));

	bio->bio_driver2 = (void *)(intptr_t)error;
	bio->bio_resid = bio->bio_bcount - xfered;

	nvmf_ns_biodone(bio);
}

static void
nvmf_ns_delete_complete(void *arg, size_t xfered, int error)
{
	struct bio *bio = arg;

	if (error != 0)
		bio->bio_resid = bio->bio_bcount;
	else
		bio->bio_resid = 0;

	free(bio->bio_driver2, M_NVMF);
	bio->bio_driver2 = (void *)(intptr_t)error;

	nvmf_ns_biodone(bio);
}

static void
nvmf_ns_bio_complete(void *arg, const struct nvme_completion *cqe)
{
	struct bio *bio = arg;

	if (nvmf_cqe_aborted(cqe))
		bio->bio_error = ECONNABORTED;
	else if (cqe->status != 0)
		bio->bio_error = EIO;

	nvmf_ns_biodone(bio);
}

static int
nvmf_ns_submit_bio(struct nvmf_namespace *ns, struct bio *bio)
{
	struct nvme_command cmd;
	struct nvmf_request *req;
	struct nvme_dsm_range *dsm_range;
	struct memdesc mem;
	uint64_t lba, lba_count;
	int error;

	dsm_range = NULL;
	memset(&cmd, 0, sizeof(cmd));
	switch (bio->bio_cmd) {
	case BIO_READ:
		lba = bio->bio_offset / ns->lba_size;
		lba_count = bio->bio_bcount / ns->lba_size;
		nvme_ns_read_cmd(&cmd, ns->id, lba, lba_count);
		break;
	case BIO_WRITE:
		lba = bio->bio_offset / ns->lba_size;
		lba_count = bio->bio_bcount / ns->lba_size;
		nvme_ns_write_cmd(&cmd, ns->id, lba, lba_count);
		break;
	case BIO_FLUSH:
		nvme_ns_flush_cmd(&cmd, ns->id);
		break;
	case BIO_DELETE:
		dsm_range = malloc(sizeof(*dsm_range), M_NVMF, M_NOWAIT |
		    M_ZERO);
		if (dsm_range == NULL)
			return (ENOMEM);
		lba = bio->bio_offset / ns->lba_size;
		lba_count = bio->bio_bcount / ns->lba_size;
		dsm_range->starting_lba = htole64(lba);
		dsm_range->length = htole32(lba_count);

		cmd.opc = NVME_OPC_DATASET_MANAGEMENT;
		cmd.nsid = htole32(ns->id);
		cmd.cdw10 = htole32(0);		/* 1 range */
		cmd.cdw11 = htole32(NVME_DSM_ATTR_DEALLOCATE);
		break;
	default:
		return (EOPNOTSUPP);
	}

	mtx_lock(&ns->lock);
	if (ns->disconnected) {
		if (nvmf_fail_disconnect || ns->shutdown) {
			error = ECONNABORTED;
		} else {
			TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue);
			error = 0;
		}
		mtx_unlock(&ns->lock);
		free(dsm_range, M_NVMF);
		return (error);
	}

	req = nvmf_allocate_request(nvmf_select_io_queue(ns->sc), &cmd,
	    nvmf_ns_bio_complete, bio, M_NOWAIT);
	if (req == NULL) {
		mtx_unlock(&ns->lock);
		free(dsm_range, M_NVMF);
		return (ENOMEM);
	}

	switch (bio->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
		refcount_init(bio_refs(bio), 2);
		mem = memdesc_bio(bio);
		nvmf_capsule_append_data(req->nc, &mem, bio->bio_bcount,
		    bio->bio_cmd == BIO_WRITE, nvmf_ns_io_complete, bio);
		break;
	case BIO_DELETE:
		refcount_init(bio_refs(bio), 2);
		mem = memdesc_vaddr(dsm_range, sizeof(*dsm_range));
		nvmf_capsule_append_data(req->nc, &mem, sizeof(*dsm_range),
		    true, nvmf_ns_delete_complete, bio);
		bio->bio_driver2 = dsm_range;
		break;
	default:
		refcount_init(bio_refs(bio), 1);
		KASSERT(bio->bio_resid == 0,
		    ("%s: input bio_resid != 0", __func__));
		break;
	}

	refcount_acquire(&ns->active_bios);
	nvmf_submit_request(req);
	mtx_unlock(&ns->lock);
	return (0);
}

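/*
 * Character device ioctls: NVMe passthrough commands, the nsid query,
 * and the standard disk(4) geometry queries.  A userland consumer
 * might fetch the namespace geometry like this (sketch only, error
 * handling omitted; the device name assumes unit 0, namespace 1):
 *
 *	int fd = open("/dev/nvmf0n1", O_RDONLY);
 *	off_t size;
 *	u_int secsize;
 *	ioctl(fd, DIOCGMEDIASIZE, &size);
 *	ioctl(fd, DIOCGSECTORSIZE, &secsize);
 */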
static int
nvmf_ns_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int flag,
    struct thread *td)
{
	struct nvmf_namespace *ns = dev->si_drv1;
	struct nvme_get_nsid *gnsid;
	struct nvme_pt_command *pt;

	switch (cmd) {
	case NVME_PASSTHROUGH_CMD:
		pt = (struct nvme_pt_command *)arg;
		pt->cmd.nsid = htole32(ns->id);
		return (nvmf_passthrough_cmd(ns->sc, pt, false));
	case NVME_GET_NSID:
		gnsid = (struct nvme_get_nsid *)arg;
		strlcpy(gnsid->cdev, device_get_nameunit(ns->sc->dev),
		    sizeof(gnsid->cdev));
		gnsid->nsid = ns->id;
		return (0);
	case DIOCGMEDIASIZE:
		*(off_t *)arg = ns->size;
		return (0);
	case DIOCGSECTORSIZE:
		*(u_int *)arg = ns->lba_size;
		return (0);
	default:
		return (ENOTTY);
	}
}

static int
nvmf_ns_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
{
	int error;

	error = 0;
	if ((oflags & FWRITE) != 0)
		error = securelevel_gt(td->td_ucred, 0);
	return (error);
}

static void
nvmf_ns_strategy(struct bio *bio)
{
	struct nvmf_namespace *ns;
	int error;

	ns = bio->bio_dev->si_drv1;

	error = nvmf_ns_submit_bio(ns, bio);
	if (error != 0) {
		bio->bio_error = error;
		bio->bio_flags |= BIO_ERROR;
		bio->bio_resid = bio->bio_bcount;
		biodone(bio);
	}
}

static struct cdevsw nvmf_ns_cdevsw = {
	.d_version = D_VERSION,
	.d_flags = D_DISK,
	.d_open = nvmf_ns_open,
	.d_read = physread,
	.d_write = physwrite,
	.d_strategy = nvmf_ns_strategy,
	.d_ioctl = nvmf_ns_ioctl
};

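/*
 * Create the namespace's character device after validating the LBA
 * format described in the Identify Namespace data.  Namespaces using
 * end-to-end data protection or per-LBA metadata are rejected.
 */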
struct nvmf_namespace *
nvmf_init_ns(struct nvmf_softc *sc, uint32_t id,
    const struct nvme_namespace_data *data)
{
	struct make_dev_args mda;
	struct nvmf_namespace *ns;
	int error;
	uint8_t lbads, lbaf;

	ns = malloc(sizeof(*ns), M_NVMF, M_WAITOK | M_ZERO);
	ns->sc = sc;
	ns->id = id;
	TAILQ_INIT(&ns->pending_bios);
	mtx_init(&ns->lock, "nvmf ns", NULL, MTX_DEF);

	/*
	 * Hold an initial reference so that active_bios cannot drop to
	 * zero until the namespace is destroyed.
	 */
	refcount_init(&ns->active_bios, 1);

	if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) {
		ns_printf(ns, "End-to-end data protection not supported\n");
		goto fail;
	}

	lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas);
	if (lbaf > data->nlbaf) {
		ns_printf(ns, "Invalid LBA format index\n");
		goto fail;
	}

	if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) {
		ns_printf(ns, "Namespaces with metadata are not supported\n");
		goto fail;
	}

	lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]);
	if (lbads == 0) {
		ns_printf(ns, "Invalid LBA format index\n");
		goto fail;
	}

	ns->lba_size = 1 << lbads;
	ns->size = data->nsze * ns->lba_size;

	if (nvme_ctrlr_has_dataset_mgmt(sc->cdata))
		ns->flags |= NVME_NS_DEALLOCATE_SUPPORTED;

	if (NVMEV(NVME_CTRLR_DATA_VWC_PRESENT, sc->cdata->vwc) != 0)
		ns->flags |= NVME_NS_FLUSH_SUPPORTED;

	/*
	 * XXX: Does any of the boundary splitting for NOIOB make any
	 * sense for Fabrics?
	 */

	make_dev_args_init(&mda);
	mda.mda_devsw = &nvmf_ns_cdevsw;
	mda.mda_uid = UID_ROOT;
	mda.mda_gid = GID_WHEEL;
	mda.mda_mode = 0600;
	mda.mda_si_drv1 = ns;
	error = make_dev_s(&mda, &ns->cdev, "%sn%u",
	    device_get_nameunit(sc->dev), id);
	if (error != 0)
		goto fail;
	ns->cdev->si_drv2 = make_dev_alias(ns->cdev, "%sns%u",
	    device_get_nameunit(sc->dev), id);

	ns->cdev->si_flags |= SI_UNMAPPED;

	return (ns);
fail:
	mtx_destroy(&ns->lock);
	free(ns, M_NVMF);
	return (NULL);
}

void
nvmf_disconnect_ns(struct nvmf_namespace *ns)
{
	mtx_lock(&ns->lock);
	ns->disconnected = true;
	mtx_unlock(&ns->lock);
}

void
nvmf_reconnect_ns(struct nvmf_namespace *ns)
{
	TAILQ_HEAD(, bio) bios;
	struct bio *bio;

	mtx_lock(&ns->lock);
	ns->disconnected = false;
	TAILQ_INIT(&bios);
	TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue);
	mtx_unlock(&ns->lock);

	/* Resubmit any I/O requests queued while disconnected. */
	while (!TAILQ_EMPTY(&bios)) {
		bio = TAILQ_FIRST(&bios);
		TAILQ_REMOVE(&bios, bio, bio_queue);
		nvmf_ns_strategy(bio);
	}
}

void
nvmf_shutdown_ns(struct nvmf_namespace *ns)
{
	TAILQ_HEAD(, bio) bios;
	struct bio *bio;

	mtx_lock(&ns->lock);
	ns->shutdown = true;
	TAILQ_INIT(&bios);
	TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue);
	mtx_unlock(&ns->lock);

	/* Fail any pending I/O requests. */
	while (!TAILQ_EMPTY(&bios)) {
		bio = TAILQ_FIRST(&bios);
		TAILQ_REMOVE(&bios, bio, bio_queue);
		bio->bio_error = ECONNABORTED;
		bio->bio_flags |= BIO_ERROR;
		bio->bio_resid = bio->bio_bcount;
		biodone(bio);
	}
}

void
nvmf_destroy_ns(struct nvmf_namespace *ns)
{
	TAILQ_HEAD(, bio) bios;
	struct bio *bio;

	if (ns->cdev->si_drv2 != NULL)
		destroy_dev(ns->cdev->si_drv2);
	destroy_dev(ns->cdev);

	/*
	 * Wait for active I/O requests to drain.  The release here
	 * drops the initial reference taken when the namespace was
	 * created.
	 */
	mtx_lock(&ns->lock);
	if (!refcount_release(&ns->active_bios)) {
		while (ns->active_bios != 0)
			mtx_sleep(ns, &ns->lock, 0, "nvmfrmns", 0);
	}

	/* Abort any pending I/O requests. */
	TAILQ_INIT(&bios);
	TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue);
	mtx_unlock(&ns->lock);

	while (!TAILQ_EMPTY(&bios)) {
		bio = TAILQ_FIRST(&bios);
		TAILQ_REMOVE(&bios, bio, bio_queue);
		bio->bio_error = ECONNABORTED;
		bio->bio_flags |= BIO_ERROR;
		bio->bio_resid = bio->bio_bcount;
		biodone(bio);
	}

	mtx_destroy(&ns->lock);
	free(ns, M_NVMF);
}

bool
nvmf_update_ns(struct nvmf_namespace *ns,
    const struct nvme_namespace_data *data)
{
	uint8_t lbads, lbaf;

	if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) {
		ns_printf(ns, "End-to-end data protection not supported\n");
		return (false);
	}

	lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas);
	if (lbaf > data->nlbaf) {
		ns_printf(ns, "Invalid LBA format index\n");
		return (false);
	}

	if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) {
		ns_printf(ns, "Namespaces with metadata are not supported\n");
		return (false);
	}

	lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]);
	if (lbads == 0) {
		ns_printf(ns, "Invalid LBA format index\n");
		return (false);
	}

	ns->lba_size = 1 << lbads;
	ns->size = data->nsze * ns->lba_size;
	return (true);
}