/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2023-2024 Chelsio Communications, Inc.
 * Written by: John Baldwin <jhb@FreeBSD.org>
 */

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/memdesc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/reboot.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/nvmf.h>
#include <dev/nvmf/nvmf_transport.h>
#include <dev/nvmf/host/nvmf_var.h>

static struct cdevsw nvmf_cdevsw;

bool nvmf_fail_disconnect = false;
SYSCTL_BOOL(_kern_nvmf, OID_AUTO, fail_on_disconnection, CTLFLAG_RWTUN,
    &nvmf_fail_disconnect, 0, "Fail I/O requests on connection failure");

MALLOC_DEFINE(M_NVMF, "nvmf", "NVMe over Fabrics host");

static void nvmf_disconnect_task(void *arg, int pending);
static void nvmf_shutdown_pre_sync(void *arg, int howto);
static void nvmf_shutdown_post_sync(void *arg, int howto);

void
nvmf_complete(void *arg, const struct nvme_completion *cqe)
{
	struct nvmf_completion_status *status = arg;
	struct mtx *mtx;

	status->cqe = *cqe;
	mtx = mtx_pool_find(mtxpool_sleep, status);
	mtx_lock(mtx);
	status->done = true;
	mtx_unlock(mtx);
	wakeup(status);
}

void
nvmf_io_complete(void *arg, size_t xfered, int error)
{
	struct nvmf_completion_status *status = arg;
	struct mtx *mtx;

	status->io_error = error;
	mtx = mtx_pool_find(mtxpool_sleep, status);
	mtx_lock(mtx);
	status->io_done = true;
	mtx_unlock(mtx);
	wakeup(status);
}

void
nvmf_wait_for_reply(struct nvmf_completion_status *status)
{
	struct mtx *mtx;

	mtx = mtx_pool_find(mtxpool_sleep, status);
	mtx_lock(mtx);
	while (!status->done || !status->io_done)
		mtx_sleep(status, mtx, 0, "nvmfcmd", 0);
	mtx_unlock(mtx);
}

static int
nvmf_read_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
    uint64_t *value)
{
	const struct nvmf_fabric_prop_get_rsp *rsp;
	struct nvmf_completion_status status;

	nvmf_status_init(&status);
	if (!nvmf_cmd_get_property(sc, offset, size, nvmf_complete, &status,
	    M_WAITOK))
		return (ECONNABORTED);
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev, "PROPERTY_GET failed, status %#x\n",
		    le16toh(status.cqe.status));
		return (EIO);
	}

	rsp = (const struct nvmf_fabric_prop_get_rsp *)&status.cqe;
	if (size == 8)
		*value = le64toh(rsp->value.u64);
	else
		*value = le32toh(rsp->value.u32.low);
	return (0);
}

static int
nvmf_write_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
    uint64_t value)
{
	struct nvmf_completion_status status;

	nvmf_status_init(&status);
	if (!nvmf_cmd_set_property(sc, offset, size, value, nvmf_complete,
	    &status, M_WAITOK))
		return (ECONNABORTED);
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev, "PROPERTY_SET failed, status %#x\n",
		    le16toh(status.cqe.status));
		return (EIO);
	}
	return (0);
}
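
/*
 * Request an orderly shutdown of the remote controller by setting
 * CC.SHN to "normal shutdown" via a Fabrics Property Set command.
 */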
static void
nvmf_shutdown_controller(struct nvmf_softc *sc)
{
	uint64_t cc;
	int error;

	error = nvmf_read_property(sc, NVMF_PROP_CC, 4, &cc);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch CC for shutdown\n");
		return;
	}

	cc |= NVMEF(NVME_CC_REG_SHN, NVME_SHN_NORMAL);

	error = nvmf_write_property(sc, NVMF_PROP_CC, 4, cc);
	if (error != 0)
		device_printf(sc->dev,
		    "Failed to set CC to trigger shutdown\n");
}

static void
nvmf_check_keep_alive(void *arg)
{
	struct nvmf_softc *sc = arg;
	int traffic;

	traffic = atomic_readandclear_int(&sc->ka_active_rx_traffic);
	if (traffic == 0) {
		device_printf(sc->dev,
		    "disconnecting due to KeepAlive timeout\n");
		nvmf_disconnect(sc);
		return;
	}

	callout_schedule_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0, C_HARDCLOCK);
}

static void
nvmf_keep_alive_complete(void *arg, const struct nvme_completion *cqe)
{
	struct nvmf_softc *sc = arg;

	atomic_store_int(&sc->ka_active_rx_traffic, 1);
	if (cqe->status != 0) {
		device_printf(sc->dev,
		    "KeepAlive response reported status %#x\n",
		    le16toh(cqe->status));
	}
}

static void
nvmf_send_keep_alive(void *arg)
{
	struct nvmf_softc *sc = arg;
	int traffic;

	/*
	 * Don't bother sending a KeepAlive command if TKAS is active
	 * and another command has been sent during the interval.
	 */
	traffic = atomic_load_int(&sc->ka_active_tx_traffic);
	if (traffic == 0 && !nvmf_cmd_keep_alive(sc, nvmf_keep_alive_complete,
	    sc, M_NOWAIT))
		device_printf(sc->dev,
		    "Failed to allocate KeepAlive command\n");

	/* Clear ka_active_tx_traffic after sending the keep alive command. */
	atomic_store_int(&sc->ka_active_tx_traffic, 0);

	callout_schedule_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, C_HARDCLOCK);
}

int
nvmf_init_ivars(struct nvmf_ivars *ivars, struct nvmf_handoff_host *hh)
{
	size_t len;
	u_int i;
	int error;

	memset(ivars, 0, sizeof(*ivars));

	if (!hh->admin.admin || hh->num_io_queues < 1)
		return (EINVAL);

	ivars->cdata = malloc(sizeof(*ivars->cdata), M_NVMF, M_WAITOK);
	error = copyin(hh->cdata, ivars->cdata, sizeof(*ivars->cdata));
	if (error != 0)
		goto out;
	nvme_controller_data_swapbytes(ivars->cdata);

	len = hh->num_io_queues * sizeof(*ivars->io_params);
	ivars->io_params = malloc(len, M_NVMF, M_WAITOK);
	error = copyin(hh->io, ivars->io_params, len);
	if (error != 0)
		goto out;
	for (i = 0; i < hh->num_io_queues; i++) {
		if (ivars->io_params[i].admin) {
			error = EINVAL;
			goto out;
		}

		/* Require all I/O queues to be the same size. */
		if (ivars->io_params[i].qsize != ivars->io_params[0].qsize) {
			error = EINVAL;
			goto out;
		}
	}

	ivars->hh = hh;
	return (0);

out:
	free(ivars->io_params, M_NVMF);
	free(ivars->cdata, M_NVMF);
	return (error);
}

void
nvmf_free_ivars(struct nvmf_ivars *ivars)
{
	free(ivars->io_params, M_NVMF);
	free(ivars->cdata, M_NVMF);
}

static int
nvmf_probe(device_t dev)
{
	struct nvmf_ivars *ivars = device_get_ivars(dev);

	if (ivars == NULL)
		return (ENXIO);

	device_set_descf(dev, "Fabrics: %.256s", ivars->cdata->subnqn);
	return (BUS_PROBE_DEFAULT);
}
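
/*
 * Create the admin and I/O queue pairs described by the handoff data.
 * If a KATO was negotiated, arm the Keep Alive timers: Keep Alive
 * commands are sent at half the KATO interval and the association is
 * disconnected if no traffic arrives from the controller within a
 * full interval.
 */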
static int
nvmf_establish_connection(struct nvmf_softc *sc, struct nvmf_ivars *ivars)
{
	char name[16];

	/* Setup the admin queue. */
	sc->admin = nvmf_init_qp(sc, ivars->hh->trtype, &ivars->hh->admin,
	    "admin queue", 0);
	if (sc->admin == NULL) {
		device_printf(sc->dev, "Failed to setup admin queue\n");
		return (ENXIO);
	}

	/* Setup I/O queues. */
	sc->io = malloc(ivars->hh->num_io_queues * sizeof(*sc->io), M_NVMF,
	    M_WAITOK | M_ZERO);
	sc->num_io_queues = ivars->hh->num_io_queues;
	for (u_int i = 0; i < sc->num_io_queues; i++) {
		snprintf(name, sizeof(name), "I/O queue %u", i);
		sc->io[i] = nvmf_init_qp(sc, ivars->hh->trtype,
		    &ivars->io_params[i], name, i);
		if (sc->io[i] == NULL) {
			device_printf(sc->dev, "Failed to setup I/O queue %u\n",
			    i + 1);
			return (ENXIO);
		}
	}

	/* Start KeepAlive timers. */
	if (ivars->hh->kato != 0) {
		sc->ka_traffic = NVMEV(NVME_CTRLR_DATA_CTRATT_TBKAS,
		    sc->cdata->ctratt) != 0;
		sc->ka_rx_sbt = mstosbt(ivars->hh->kato);
		sc->ka_tx_sbt = sc->ka_rx_sbt / 2;
		callout_reset_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0,
		    nvmf_check_keep_alive, sc, C_HARDCLOCK);
		callout_reset_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0,
		    nvmf_send_keep_alive, sc, C_HARDCLOCK);
	}

	return (0);
}

typedef bool nvmf_scan_active_ns_cb(struct nvmf_softc *, uint32_t,
    const struct nvme_namespace_data *, void *);

static bool
nvmf_scan_active_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist,
    struct nvme_namespace_data *data, uint32_t *nsidp,
    nvmf_scan_active_ns_cb *cb, void *cb_arg)
{
	struct nvmf_completion_status status;
	uint32_t nsid;

	nvmf_status_init(&status);
	nvmf_status_wait_io(&status);
	if (!nvmf_cmd_identify_active_namespaces(sc, *nsidp, nslist,
	    nvmf_complete, &status, nvmf_io_complete, &status, M_WAITOK)) {
		device_printf(sc->dev,
		    "failed to send IDENTIFY active namespaces command\n");
		return (false);
	}
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev,
		    "IDENTIFY active namespaces failed, status %#x\n",
		    le16toh(status.cqe.status));
		return (false);
	}

	if (status.io_error != 0) {
		device_printf(sc->dev,
		    "IDENTIFY active namespaces failed with I/O error %d\n",
		    status.io_error);
		return (false);
	}

	for (u_int i = 0; i < nitems(nslist->ns); i++) {
		nsid = nslist->ns[i];
		if (nsid == 0) {
			*nsidp = 0;
			return (true);
		}

		nvmf_status_init(&status);
		nvmf_status_wait_io(&status);
		if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
		    &status, nvmf_io_complete, &status, M_WAITOK)) {
			device_printf(sc->dev,
			    "failed to send IDENTIFY namespace %u command\n",
			    nsid);
			return (false);
		}
		nvmf_wait_for_reply(&status);

		if (status.cqe.status != 0) {
			device_printf(sc->dev,
			    "IDENTIFY namespace %u failed, status %#x\n", nsid,
			    le16toh(status.cqe.status));
			return (false);
		}

		if (status.io_error != 0) {
			device_printf(sc->dev,
			    "IDENTIFY namespace %u failed with I/O error %d\n",
			    nsid, status.io_error);
			return (false);
		}

		nvme_namespace_data_swapbytes(data);
		if (!cb(sc, nsid, data, cb_arg))
			return (false);
	}

	MPASS(nsid == nslist->ns[nitems(nslist->ns) - 1] && nsid != 0);

	if (nsid >= NVME_GLOBAL_NAMESPACE_TAG - 1)
		*nsidp = 0;
	else
		*nsidp = nsid;
	return (true);
}
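
/*
 * Iterate over all active namespaces, fetching the active namespace
 * ID list in chunks via nvmf_scan_active_nslist() and invoking the
 * callback for each namespace until the list is exhausted or the
 * callback fails.
 */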
static bool
nvmf_scan_active_namespaces(struct nvmf_softc *sc, nvmf_scan_active_ns_cb *cb,
    void *cb_arg)
{
	struct nvme_namespace_data *data;
	struct nvme_ns_list *nslist;
	uint32_t nsid;
	bool retval;

	nslist = malloc(sizeof(*nslist), M_NVMF, M_WAITOK);
	data = malloc(sizeof(*data), M_NVMF, M_WAITOK);

	nsid = 0;
	retval = true;
	for (;;) {
		if (!nvmf_scan_active_nslist(sc, nslist, data, &nsid, cb,
		    cb_arg)) {
			retval = false;
			break;
		}
		if (nsid == 0)
			break;
	}

	free(data, M_NVMF);
	free(nslist, M_NVMF);
	return (retval);
}

static bool
nvmf_add_ns(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data, void *arg __unused)
{
	if (sc->ns[nsid - 1] != NULL) {
		device_printf(sc->dev,
		    "duplicate namespace %u in active namespace list\n",
		    nsid);
		return (false);
	}

	/*
	 * As in nvme_ns_construct, a size of zero indicates an
	 * invalid namespace.
	 */
	if (data->nsze == 0) {
		device_printf(sc->dev,
		    "ignoring active namespace %u with zero size\n", nsid);
		return (true);
	}

	sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);

	nvmf_sim_rescan_ns(sc, nsid);
	return (true);
}

static bool
nvmf_add_namespaces(struct nvmf_softc *sc)
{
	sc->ns = mallocarray(sc->cdata->nn, sizeof(*sc->ns), M_NVMF,
	    M_WAITOK | M_ZERO);
	return (nvmf_scan_active_namespaces(sc, nvmf_add_ns, NULL));
}

static int
nvmf_attach(device_t dev)
{
	struct make_dev_args mda;
	struct nvmf_softc *sc = device_get_softc(dev);
	struct nvmf_ivars *ivars = device_get_ivars(dev);
	struct sysctl_oid *oid;
	uint64_t val;
	u_int i;
	int error;

	if (ivars == NULL)
		return (ENXIO);

	sc->dev = dev;
	sc->trtype = ivars->hh->trtype;
	callout_init(&sc->ka_rx_timer, 1);
	callout_init(&sc->ka_tx_timer, 1);
	sx_init(&sc->connection_lock, "nvmf connection");
	TASK_INIT(&sc->disconnect_task, 0, nvmf_disconnect_task, sc);

	oid = SYSCTL_ADD_NODE(device_get_sysctl_ctx(dev),
	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "ioq",
	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "I/O Queues");
	sc->ioq_oid_list = SYSCTL_CHILDREN(oid);

	/* Claim the cdata pointer from ivars. */
	sc->cdata = ivars->cdata;
	ivars->cdata = NULL;

	nvmf_init_aer(sc);

	error = nvmf_establish_connection(sc, ivars);
	if (error != 0)
		goto out;

	error = nvmf_read_property(sc, NVMF_PROP_CAP, 8, &sc->cap);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch CAP\n");
		error = ENXIO;
		goto out;
	}

	error = nvmf_read_property(sc, NVMF_PROP_VS, 4, &val);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch VS\n");
		error = ENXIO;
		goto out;
	}
	sc->vs = val;

	/* Honor MDTS if it is set. */
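	/* MDTS is a power-of-two multiple of the CAP.MPSMIN page size. */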
	sc->max_xfer_size = maxphys;
	if (sc->cdata->mdts != 0) {
		sc->max_xfer_size = ulmin(sc->max_xfer_size,
		    1 << (sc->cdata->mdts + NVME_MPS_SHIFT +
		    NVME_CAP_HI_MPSMIN(sc->cap >> 32)));
	}

	sc->max_pending_io = ivars->io_params[0].qsize * sc->num_io_queues;

	error = nvmf_init_sim(sc);
	if (error != 0)
		goto out;

	error = nvmf_start_aer(sc);
	if (error != 0) {
		nvmf_destroy_sim(sc);
		goto out;
	}

	if (!nvmf_add_namespaces(sc)) {
		nvmf_destroy_sim(sc);
		goto out;
	}

	make_dev_args_init(&mda);
	mda.mda_devsw = &nvmf_cdevsw;
	mda.mda_uid = UID_ROOT;
	mda.mda_gid = GID_WHEEL;
	mda.mda_mode = 0600;
	mda.mda_si_drv1 = sc;
	error = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev));
	if (error != 0) {
		nvmf_destroy_sim(sc);
		goto out;
	}

	sc->shutdown_pre_sync_eh = EVENTHANDLER_REGISTER(shutdown_pre_sync,
	    nvmf_shutdown_pre_sync, sc, SHUTDOWN_PRI_FIRST);
	sc->shutdown_post_sync_eh = EVENTHANDLER_REGISTER(shutdown_post_sync,
	    nvmf_shutdown_post_sync, sc, SHUTDOWN_PRI_FIRST);

	return (0);
out:
	if (sc->ns != NULL) {
		for (i = 0; i < sc->cdata->nn; i++) {
			if (sc->ns[i] != NULL)
				nvmf_destroy_ns(sc->ns[i]);
		}
		free(sc->ns, M_NVMF);
	}

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);

	if (sc->admin != NULL)
		nvmf_shutdown_controller(sc);

	for (i = 0; i < sc->num_io_queues; i++) {
		if (sc->io[i] != NULL)
			nvmf_destroy_qp(sc->io[i]);
	}
	free(sc->io, M_NVMF);
	if (sc->admin != NULL)
		nvmf_destroy_qp(sc->admin);

	nvmf_destroy_aer(sc);

	taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
	sx_destroy(&sc->connection_lock);
	free(sc->cdata, M_NVMF);
	return (error);
}

void
nvmf_disconnect(struct nvmf_softc *sc)
{
	taskqueue_enqueue(taskqueue_thread, &sc->disconnect_task);
}

static void
nvmf_disconnect_task(void *arg, int pending __unused)
{
	struct nvmf_softc *sc = arg;
	u_int i;

	sx_xlock(&sc->connection_lock);
	if (sc->admin == NULL) {
		/*
		 * Ignore transport errors if there is no active
		 * association.
		 */
		sx_xunlock(&sc->connection_lock);
		return;
	}

	if (sc->detaching) {
		if (sc->admin != NULL) {
			/*
			 * This unsticks the detach process if a
			 * transport error occurs during detach.
			 */
			nvmf_shutdown_qp(sc->admin);
		}
		sx_xunlock(&sc->connection_lock);
		return;
	}

	if (sc->cdev == NULL) {
		/*
		 * Transport error occurred during attach
		 * (nvmf_add_namespaces).  Shutdown the admin queue.
		 */
		nvmf_shutdown_qp(sc->admin);
		sx_xunlock(&sc->connection_lock);
		return;
	}

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);
	sc->ka_traffic = false;

	/* Quiesce namespace consumers. */
	nvmf_disconnect_sim(sc);
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_disconnect_ns(sc->ns[i]);
	}

	/* Shutdown the existing qpairs. */
	for (i = 0; i < sc->num_io_queues; i++) {
		nvmf_destroy_qp(sc->io[i]);
	}
	free(sc->io, M_NVMF);
	sc->io = NULL;
	sc->num_io_queues = 0;
	nvmf_destroy_qp(sc->admin);
	sc->admin = NULL;

	sx_xunlock(&sc->connection_lock);
}
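
/*
 * Re-establish an association using a new set of queue pairs handed
 * off from userland, after verifying that the new association uses
 * the same transport type and targets the same NVMe subsystem as the
 * original connection.
 */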
static int
nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_handoff_host *hh)
{
	struct nvmf_ivars ivars;
	u_int i;
	int error;

	/* XXX: Should we permit changing the transport type? */
	if (sc->trtype != hh->trtype) {
		device_printf(sc->dev,
		    "transport type mismatch on reconnect\n");
		return (EINVAL);
	}

	error = nvmf_init_ivars(&ivars, hh);
	if (error != 0)
		return (error);

	sx_xlock(&sc->connection_lock);
	if (sc->admin != NULL || sc->detaching) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Ensure this is for the same controller.  Note that the
	 * controller ID can vary across associations if the remote
	 * system is using the dynamic controller model.  This merely
	 * ensures the new association is connected to the same NVMe
	 * subsystem.
	 */
	if (memcmp(sc->cdata->subnqn, ivars.cdata->subnqn,
	    sizeof(ivars.cdata->subnqn)) != 0) {
		device_printf(sc->dev,
		    "controller subsystem NQN mismatch on reconnect\n");
		error = EINVAL;
		goto out;
	}

	/*
	 * XXX: Require same number and size of I/O queues so that
	 * max_pending_io is still correct?
	 */

	error = nvmf_establish_connection(sc, &ivars);
	if (error != 0)
		goto out;

	error = nvmf_start_aer(sc);
	if (error != 0)
		goto out;

	device_printf(sc->dev,
	    "established new association with %u I/O queues\n",
	    sc->num_io_queues);

	/* Restart namespace consumers. */
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_reconnect_ns(sc->ns[i]);
	}
	nvmf_reconnect_sim(sc);

	nvmf_rescan_all_ns(sc);
out:
	sx_xunlock(&sc->connection_lock);
	nvmf_free_ivars(&ivars);
	return (error);
}
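
/*
 * The pre-sync shutdown handler runs before filesystems are synced;
 * the post-sync handler below performs the actual controller shutdown
 * after the sync completes.
 */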
static void
nvmf_shutdown_pre_sync(void *arg, int howto)
{
	struct nvmf_softc *sc = arg;

	if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED())
		return;

	/*
	 * If this association is disconnected, abort any pending
	 * requests with an error to permit filesystems to unmount
	 * without hanging.
	 */
	sx_xlock(&sc->connection_lock);
	if (sc->admin != NULL || sc->detaching) {
		sx_xunlock(&sc->connection_lock);
		return;
	}

	for (u_int i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_shutdown_ns(sc->ns[i]);
	}
	nvmf_shutdown_sim(sc);
	sx_xunlock(&sc->connection_lock);
}

static void
nvmf_shutdown_post_sync(void *arg, int howto)
{
	struct nvmf_softc *sc = arg;

	if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED())
		return;

	/*
	 * If this association is connected, disconnect gracefully.
	 */
	sx_xlock(&sc->connection_lock);
	if (sc->admin == NULL || sc->detaching) {
		sx_xunlock(&sc->connection_lock);
		return;
	}

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);

	nvmf_shutdown_controller(sc);
	for (u_int i = 0; i < sc->num_io_queues; i++) {
		nvmf_destroy_qp(sc->io[i]);
	}
	nvmf_destroy_qp(sc->admin);
	sc->admin = NULL;
	sx_xunlock(&sc->connection_lock);
}

static int
nvmf_detach(device_t dev)
{
	struct nvmf_softc *sc = device_get_softc(dev);
	u_int i;

	destroy_dev(sc->cdev);

	sx_xlock(&sc->connection_lock);
	sc->detaching = true;
	sx_xunlock(&sc->connection_lock);

	EVENTHANDLER_DEREGISTER(shutdown_pre_sync, sc->shutdown_pre_sync_eh);
	EVENTHANDLER_DEREGISTER(shutdown_post_sync, sc->shutdown_post_sync_eh);

	nvmf_destroy_sim(sc);
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_destroy_ns(sc->ns[i]);
	}
	free(sc->ns, M_NVMF);

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);

	if (sc->admin != NULL)
		nvmf_shutdown_controller(sc);

	for (i = 0; i < sc->num_io_queues; i++) {
		nvmf_destroy_qp(sc->io[i]);
	}
	free(sc->io, M_NVMF);

	taskqueue_drain(taskqueue_thread, &sc->disconnect_task);

	if (sc->admin != NULL)
		nvmf_destroy_qp(sc->admin);

	nvmf_destroy_aer(sc);

	sx_destroy(&sc->connection_lock);
	free(sc->cdata, M_NVMF);
	return (0);
}

static void
nvmf_rescan_ns_1(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data)
{
	struct nvmf_namespace *ns;

	/* XXX: Needs locking around sc->ns[]. */
	ns = sc->ns[nsid - 1];
	if (data->nsze == 0) {
		/* XXX: Needs locking */
		if (ns != NULL) {
			nvmf_destroy_ns(ns);
			sc->ns[nsid - 1] = NULL;
		}
	} else {
		/* XXX: Needs locking */
		if (ns == NULL) {
			sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
		} else {
			if (!nvmf_update_ns(ns, data)) {
				nvmf_destroy_ns(ns);
				sc->ns[nsid - 1] = NULL;
			}
		}
	}

	nvmf_sim_rescan_ns(sc, nsid);
}

void
nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid)
{
	struct nvmf_completion_status status;
	struct nvme_namespace_data *data;

	data = malloc(sizeof(*data), M_NVMF, M_WAITOK);

	nvmf_status_init(&status);
	nvmf_status_wait_io(&status);
	if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
	    &status, nvmf_io_complete, &status, M_WAITOK)) {
		device_printf(sc->dev,
		    "failed to send IDENTIFY namespace %u command\n", nsid);
		free(data, M_NVMF);
		return;
	}
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev,
		    "IDENTIFY namespace %u failed, status %#x\n", nsid,
		    le16toh(status.cqe.status));
		free(data, M_NVMF);
		return;
	}

	if (status.io_error != 0) {
		device_printf(sc->dev,
		    "IDENTIFY namespace %u failed with I/O error %d\n",
		    nsid, status.io_error);
		free(data, M_NVMF);
		return;
	}

	nvme_namespace_data_swapbytes(data);

	nvmf_rescan_ns_1(sc, nsid, data);

	free(data, M_NVMF);
}

static void
nvmf_purge_namespaces(struct nvmf_softc *sc, uint32_t first_nsid,
    uint32_t next_valid_nsid)
{
	struct nvmf_namespace *ns;

	for (uint32_t nsid = first_nsid; nsid < next_valid_nsid; nsid++) {
		/* XXX: Needs locking around sc->ns[]. */
		ns = sc->ns[nsid - 1];
		if (ns != NULL) {
			nvmf_destroy_ns(ns);
			sc->ns[nsid - 1] = NULL;

			nvmf_sim_rescan_ns(sc, nsid);
		}
	}
}

static bool
nvmf_rescan_ns_cb(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data, void *arg)
{
	uint32_t *last_nsid = arg;

	/* Check for any gaps prior to this namespace. */
	nvmf_purge_namespaces(sc, *last_nsid + 1, nsid);
	*last_nsid = nsid;

	nvmf_rescan_ns_1(sc, nsid, data);
	return (true);
}

void
nvmf_rescan_all_ns(struct nvmf_softc *sc)
{
	uint32_t last_nsid;

	last_nsid = 0;
	if (!nvmf_scan_active_namespaces(sc, nvmf_rescan_ns_cb, &last_nsid))
		return;

	/*
	 * Check for any namespace devices after the last active
	 * namespace.
	 */
	nvmf_purge_namespaces(sc, last_nsid + 1, sc->cdata->nn + 1);
}
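
/*
 * Execute a passthrough command from userland on the admin queue or
 * an I/O queue, bouncing any payload through a temporary kernel
 * buffer.
 */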
int
nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt,
    bool admin)
{
	struct nvmf_completion_status status;
	struct nvme_command cmd;
	struct memdesc mem;
	struct nvmf_host_qpair *qp;
	struct nvmf_request *req;
	void *buf;
	int error;

	if (pt->len > sc->max_xfer_size)
		return (EINVAL);

	buf = NULL;
	if (pt->len != 0) {
		/*
		 * XXX: Depending on the size we may want to pin the
		 * user pages and use a memdesc with vm_page_t's
		 * instead.
		 */
		buf = malloc(pt->len, M_NVMF, M_WAITOK);
		if (pt->is_read == 0) {
			error = copyin(pt->buf, buf, pt->len);
			if (error != 0) {
				free(buf, M_NVMF);
				return (error);
			}
		} else {
			/* Ensure no kernel data is leaked to userland. */
			memset(buf, 0, pt->len);
		}
	}

	memset(&cmd, 0, sizeof(cmd));
	cmd.opc = pt->cmd.opc;
	cmd.fuse = pt->cmd.fuse;
	cmd.nsid = pt->cmd.nsid;
	cmd.cdw10 = pt->cmd.cdw10;
	cmd.cdw11 = pt->cmd.cdw11;
	cmd.cdw12 = pt->cmd.cdw12;
	cmd.cdw13 = pt->cmd.cdw13;
	cmd.cdw14 = pt->cmd.cdw14;
	cmd.cdw15 = pt->cmd.cdw15;

	sx_slock(&sc->connection_lock);
	if (sc->admin == NULL || sc->detaching) {
		device_printf(sc->dev,
		    "failed to send passthrough command\n");
		error = ECONNABORTED;
		sx_sunlock(&sc->connection_lock);
		goto error;
	}
	if (admin)
		qp = sc->admin;
	else
		qp = nvmf_select_io_queue(sc);
	nvmf_status_init(&status);
	req = nvmf_allocate_request(qp, &cmd, nvmf_complete, &status, M_WAITOK);
	sx_sunlock(&sc->connection_lock);
	if (req == NULL) {
		device_printf(sc->dev, "failed to send passthrough command\n");
		error = ECONNABORTED;
		goto error;
	}

	if (pt->len != 0) {
		mem = memdesc_vaddr(buf, pt->len);
		nvmf_capsule_append_data(req->nc, &mem, pt->len,
		    pt->is_read == 0, nvmf_io_complete, &status);
		nvmf_status_wait_io(&status);
	}

	nvmf_submit_request(req);
	nvmf_wait_for_reply(&status);

	memset(&pt->cpl, 0, sizeof(pt->cpl));
	pt->cpl.cdw0 = status.cqe.cdw0;
	pt->cpl.status = status.cqe.status;

	error = status.io_error;
	if (error == 0 && pt->len != 0 && pt->is_read != 0)
		error = copyout(buf, pt->buf, pt->len);
error:
	free(buf, M_NVMF);
	return (error);
}
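
/*
 * Character device ioctls: nvme(4)-compatible passthrough and
 * identification requests plus the nvmf-specific reconnect handshake.
 */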
static int
nvmf_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
    struct thread *td)
{
	struct nvmf_softc *sc = cdev->si_drv1;
	struct nvme_get_nsid *gnsid;
	struct nvme_pt_command *pt;
	struct nvmf_reconnect_params *rp;
	struct nvmf_handoff_host *hh;

	switch (cmd) {
	case NVME_PASSTHROUGH_CMD:
		pt = (struct nvme_pt_command *)arg;
		return (nvmf_passthrough_cmd(sc, pt, true));
	case NVME_GET_NSID:
		gnsid = (struct nvme_get_nsid *)arg;
		strlcpy(gnsid->cdev, device_get_nameunit(sc->dev),
		    sizeof(gnsid->cdev));
		gnsid->nsid = 0;
		return (0);
	case NVME_GET_MAX_XFER_SIZE:
		*(uint64_t *)arg = sc->max_xfer_size;
		return (0);
	case NVMF_RECONNECT_PARAMS:
		rp = (struct nvmf_reconnect_params *)arg;
		if ((sc->cdata->fcatt & 1) == 0)
			rp->cntlid = NVMF_CNTLID_DYNAMIC;
		else
			rp->cntlid = sc->cdata->ctrlr_id;
		memcpy(rp->subnqn, sc->cdata->subnqn, sizeof(rp->subnqn));
		return (0);
	case NVMF_RECONNECT_HOST:
		hh = (struct nvmf_handoff_host *)arg;
		return (nvmf_reconnect_host(sc, hh));
	default:
		return (ENOTTY);
	}
}

static struct cdevsw nvmf_cdevsw = {
	.d_version = D_VERSION,
	.d_ioctl = nvmf_ioctl
};

static int
nvmf_modevent(module_t mod, int what, void *arg)
{
	switch (what) {
	case MOD_LOAD:
		return (nvmf_ctl_load());
	case MOD_QUIESCE:
		return (0);
	case MOD_UNLOAD:
		nvmf_ctl_unload();
		destroy_dev_drain(&nvmf_cdevsw);
		return (0);
	default:
		return (EOPNOTSUPP);
	}
}

static device_method_t nvmf_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe, nvmf_probe),
	DEVMETHOD(device_attach, nvmf_attach),
	DEVMETHOD(device_detach, nvmf_detach),
	DEVMETHOD_END
};

driver_t nvme_nvmf_driver = {
	"nvme",
	nvmf_methods,
	sizeof(struct nvmf_softc),
};

DRIVER_MODULE(nvme, root, nvme_nvmf_driver, nvmf_modevent, NULL);
MODULE_DEPEND(nvmf, nvmf_transport, 1, 1, 1);