/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2023-2024 Chelsio Communications, Inc.
 * Written by: John Baldwin <jhb@FreeBSD.org>
 */

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/dnv.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/memdesc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/nv.h>
#include <sys/reboot.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/nvmf.h>
#include <dev/nvmf/nvmf_transport.h>
#include <dev/nvmf/host/nvmf_var.h>

static struct cdevsw nvmf_cdevsw;

bool nvmf_fail_disconnect = false;
SYSCTL_BOOL(_kern_nvmf, OID_AUTO, fail_on_disconnection, CTLFLAG_RWTUN,
    &nvmf_fail_disconnect, 0, "Fail I/O requests on connection failure");

MALLOC_DEFINE(M_NVMF, "nvmf", "NVMe over Fabrics host");

static void	nvmf_disconnect_task(void *arg, int pending);
static void	nvmf_shutdown_pre_sync(void *arg, int howto);
static void	nvmf_shutdown_post_sync(void *arg, int howto);

void
nvmf_complete(void *arg, const struct nvme_completion *cqe)
{
	struct nvmf_completion_status *status = arg;
	struct mtx *mtx;

	status->cqe = *cqe;
	mtx = mtx_pool_find(mtxpool_sleep, status);
	mtx_lock(mtx);
	status->done = true;
	mtx_unlock(mtx);
	wakeup(status);
}

void
nvmf_io_complete(void *arg, size_t xfered, int error)
{
	struct nvmf_completion_status *status = arg;
	struct mtx *mtx;

	status->io_error = error;
	mtx = mtx_pool_find(mtxpool_sleep, status);
	mtx_lock(mtx);
	status->io_done = true;
	mtx_unlock(mtx);
	wakeup(status);
}

void
nvmf_wait_for_reply(struct nvmf_completion_status *status)
{
	struct mtx *mtx;

	mtx = mtx_pool_find(mtxpool_sleep, status);
	mtx_lock(mtx);
	while (!status->done || !status->io_done)
		mtx_sleep(status, mtx, 0, "nvmfcmd", 0);
	mtx_unlock(mtx);
}

static int
nvmf_read_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
    uint64_t *value)
{
	const struct nvmf_fabric_prop_get_rsp *rsp;
	struct nvmf_completion_status status;

	nvmf_status_init(&status);
	if (!nvmf_cmd_get_property(sc, offset, size, nvmf_complete, &status,
	    M_WAITOK))
		return (ECONNABORTED);
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev, "PROPERTY_GET failed, status %#x\n",
		    le16toh(status.cqe.status));
		return (EIO);
	}

	rsp = (const struct nvmf_fabric_prop_get_rsp *)&status.cqe;
	if (size == 8)
		*value = le64toh(rsp->value.u64);
	else
		*value = le32toh(rsp->value.u32.low);
	return (0);
}

static int
nvmf_write_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
    uint64_t value)
{
	struct nvmf_completion_status status;

	nvmf_status_init(&status);
	if (!nvmf_cmd_set_property(sc, offset, size, value, nvmf_complete,
	    &status, M_WAITOK))
		return (ECONNABORTED);
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev, "PROPERTY_SET failed, status %#x\n",
		    le16toh(status.cqe.status));
		return (EIO);
	}
	return (0);
}
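
/*
 * Request a controller shutdown by setting CC.SHN before an
 * association is torn down.
 */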
static void
nvmf_shutdown_controller(struct nvmf_softc *sc)
{
	uint64_t cc;
	int error;

	error = nvmf_read_property(sc, NVMF_PROP_CC, 4, &cc);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch CC for shutdown\n");
		return;
	}

	cc |= NVMEF(NVME_CC_REG_SHN, NVME_SHN_NORMAL);

	error = nvmf_write_property(sc, NVMF_PROP_CC, 4, cc);
	if (error != 0)
		device_printf(sc->dev,
		    "Failed to set CC to trigger shutdown\n");
}

static void
nvmf_check_keep_alive(void *arg)
{
	struct nvmf_softc *sc = arg;
	int traffic;

	traffic = atomic_readandclear_int(&sc->ka_active_rx_traffic);
	if (traffic == 0) {
		device_printf(sc->dev,
		    "disconnecting due to KeepAlive timeout\n");
		nvmf_disconnect(sc);
		return;
	}

	callout_schedule_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0, C_HARDCLOCK);
}

static void
nvmf_keep_alive_complete(void *arg, const struct nvme_completion *cqe)
{
	struct nvmf_softc *sc = arg;

	atomic_store_int(&sc->ka_active_rx_traffic, 1);
	if (cqe->status != 0) {
		device_printf(sc->dev,
		    "KeepAlive response reported status %#x\n",
		    le16toh(cqe->status));
	}
}

static void
nvmf_send_keep_alive(void *arg)
{
	struct nvmf_softc *sc = arg;
	int traffic;

	/*
	 * Don't bother sending a KeepAlive command if TKAS is active
	 * and another command has been sent during the interval.
	 */
	traffic = atomic_load_int(&sc->ka_active_tx_traffic);
	if (traffic == 0 && !nvmf_cmd_keep_alive(sc, nvmf_keep_alive_complete,
	    sc, M_NOWAIT))
		device_printf(sc->dev,
		    "Failed to allocate KeepAlive command\n");

	/* Clear ka_active_tx_traffic after sending the keep alive command. */
	atomic_store_int(&sc->ka_active_tx_traffic, 0);

	callout_schedule_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, C_HARDCLOCK);
}
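
/*
 * Validate a handoff nvlist copied in from userland.  The nvlist must
 * describe the transport type, an admin queue, at least one I/O queue
 * (all of the same queue size), and the controller data structure.
 */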
int
nvmf_copyin_handoff(const struct nvmf_ioc_nv *nv, nvlist_t **nvlp)
{
	const nvlist_t *const *io;
	const nvlist_t *admin;
	nvlist_t *nvl;
	size_t i, num_io_queues;
	uint32_t qsize;
	int error;

	error = nvmf_unpack_ioc_nvlist(nv, &nvl);
	if (error != 0)
		return (error);

	if (!nvlist_exists_number(nvl, "trtype") ||
	    !nvlist_exists_nvlist(nvl, "admin") ||
	    !nvlist_exists_nvlist_array(nvl, "io") ||
	    !nvlist_exists_binary(nvl, "cdata"))
		goto invalid;

	admin = nvlist_get_nvlist(nvl, "admin");
	if (!nvmf_validate_qpair_nvlist(admin, false))
		goto invalid;
	if (!nvlist_get_bool(admin, "admin"))
		goto invalid;

	io = nvlist_get_nvlist_array(nvl, "io", &num_io_queues);
	if (num_io_queues < 1)
		goto invalid;
	for (i = 0; i < num_io_queues; i++) {
		if (!nvmf_validate_qpair_nvlist(io[i], false))
			goto invalid;
	}

	/* Require all I/O queues to be the same size. */
	qsize = nvlist_get_number(io[0], "qsize");
	for (i = 1; i < num_io_queues; i++) {
		if (nvlist_get_number(io[i], "qsize") != qsize)
			goto invalid;
	}

	nvlist_get_binary(nvl, "cdata", &i);
	if (i != sizeof(struct nvme_controller_data))
		goto invalid;

	*nvlp = nvl;
	return (0);
invalid:
	nvlist_destroy(nvl);
	return (EINVAL);
}

static int
nvmf_probe(device_t dev)
{
	const nvlist_t *nvl = device_get_ivars(dev);
	const struct nvme_controller_data *cdata;

	if (nvl == NULL)
		return (ENXIO);

	cdata = nvlist_get_binary(nvl, "cdata", NULL);
	device_set_descf(dev, "Fabrics: %.256s", cdata->subnqn);
	return (BUS_PROBE_DEFAULT);
}
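
/*
 * Create the admin and I/O queue pairs described by the handoff
 * nvlist and, if a KeepAlive timeout (KATO) was negotiated, start the
 * KeepAlive timers.
 */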
static int
nvmf_establish_connection(struct nvmf_softc *sc, const nvlist_t *nvl)
{
	const nvlist_t *const *io;
	const nvlist_t *admin;
	uint64_t kato;
	size_t num_io_queues;
	enum nvmf_trtype trtype;
	char name[16];

	trtype = nvlist_get_number(nvl, "trtype");
	admin = nvlist_get_nvlist(nvl, "admin");
	io = nvlist_get_nvlist_array(nvl, "io", &num_io_queues);
	kato = dnvlist_get_number(nvl, "kato", 0);

	/* Setup the admin queue. */
	sc->admin = nvmf_init_qp(sc, trtype, admin, "admin queue", 0);
	if (sc->admin == NULL) {
		device_printf(sc->dev, "Failed to setup admin queue\n");
		return (ENXIO);
	}

	/* Setup I/O queues. */
	sc->io = malloc(num_io_queues * sizeof(*sc->io), M_NVMF,
	    M_WAITOK | M_ZERO);
	sc->num_io_queues = num_io_queues;
	for (u_int i = 0; i < sc->num_io_queues; i++) {
		snprintf(name, sizeof(name), "I/O queue %u", i);
		sc->io[i] = nvmf_init_qp(sc, trtype, io[i], name, i);
		if (sc->io[i] == NULL) {
			device_printf(sc->dev, "Failed to setup I/O queue %u\n",
			    i + 1);
			return (ENXIO);
		}
	}

	/* Start KeepAlive timers. */
	if (kato != 0) {
		sc->ka_traffic = NVMEV(NVME_CTRLR_DATA_CTRATT_TBKAS,
		    sc->cdata->ctratt) != 0;
		sc->ka_rx_sbt = mstosbt(kato);
		sc->ka_tx_sbt = sc->ka_rx_sbt / 2;
		callout_reset_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0,
		    nvmf_check_keep_alive, sc, C_HARDCLOCK);
		callout_reset_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0,
		    nvmf_send_keep_alive, sc, C_HARDCLOCK);
	}

	memcpy(sc->cdata, nvlist_get_binary(nvl, "cdata", NULL),
	    sizeof(*sc->cdata));

	return (0);
}

typedef bool nvmf_scan_active_ns_cb(struct nvmf_softc *, uint32_t,
    const struct nvme_namespace_data *, void *);
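
/*
 * Fetch one page of the active namespace ID list starting after
 * *nsidp, identify each listed namespace, and invoke the callback for
 * each one.  On return, *nsidp is the last namespace ID seen, or 0
 * once the end of the list has been reached.
 */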
static bool
nvmf_scan_active_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist,
    struct nvme_namespace_data *data, uint32_t *nsidp,
    nvmf_scan_active_ns_cb *cb, void *cb_arg)
{
	struct nvmf_completion_status status;
	uint32_t nsid;

	nvmf_status_init(&status);
	nvmf_status_wait_io(&status);
	if (!nvmf_cmd_identify_active_namespaces(sc, *nsidp, nslist,
	    nvmf_complete, &status, nvmf_io_complete, &status, M_WAITOK)) {
		device_printf(sc->dev,
		    "failed to send IDENTIFY active namespaces command\n");
		return (false);
	}
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev,
		    "IDENTIFY active namespaces failed, status %#x\n",
		    le16toh(status.cqe.status));
		return (false);
	}

	if (status.io_error != 0) {
		device_printf(sc->dev,
		    "IDENTIFY active namespaces failed with I/O error %d\n",
		    status.io_error);
		return (false);
	}

	for (u_int i = 0; i < nitems(nslist->ns); i++) {
		nsid = nslist->ns[i];
		if (nsid == 0) {
			*nsidp = 0;
			return (true);
		}

		nvmf_status_init(&status);
		nvmf_status_wait_io(&status);
		if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
		    &status, nvmf_io_complete, &status, M_WAITOK)) {
			device_printf(sc->dev,
			    "failed to send IDENTIFY namespace %u command\n",
			    nsid);
			return (false);
		}
		nvmf_wait_for_reply(&status);

		if (status.cqe.status != 0) {
			device_printf(sc->dev,
			    "IDENTIFY namespace %u failed, status %#x\n", nsid,
			    le16toh(status.cqe.status));
			return (false);
		}

		if (status.io_error != 0) {
			device_printf(sc->dev,
			    "IDENTIFY namespace %u failed with I/O error %d\n",
			    nsid, status.io_error);
			return (false);
		}

		nvme_namespace_data_swapbytes(data);
		if (!cb(sc, nsid, data, cb_arg))
			return (false);
	}

	MPASS(nsid == nslist->ns[nitems(nslist->ns) - 1] && nsid != 0);

	if (nsid >= NVME_GLOBAL_NAMESPACE_TAG - 1)
		*nsidp = 0;
	else
		*nsidp = nsid;
	return (true);
}

static bool
nvmf_scan_active_namespaces(struct nvmf_softc *sc, nvmf_scan_active_ns_cb *cb,
    void *cb_arg)
{
	struct nvme_namespace_data *data;
	struct nvme_ns_list *nslist;
	uint32_t nsid;
	bool retval;

	nslist = malloc(sizeof(*nslist), M_NVMF, M_WAITOK);
	data = malloc(sizeof(*data), M_NVMF, M_WAITOK);

	nsid = 0;
	retval = true;
	for (;;) {
		if (!nvmf_scan_active_nslist(sc, nslist, data, &nsid, cb,
		    cb_arg)) {
			retval = false;
			break;
		}
		if (nsid == 0)
			break;
	}

	free(data, M_NVMF);
	free(nslist, M_NVMF);
	return (retval);
}

static bool
nvmf_add_ns(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data, void *arg __unused)
{
	if (sc->ns[nsid - 1] != NULL) {
		device_printf(sc->dev,
		    "duplicate namespace %u in active namespace list\n",
		    nsid);
		return (false);
	}

	/*
	 * As in nvme_ns_construct, a size of zero indicates an
	 * invalid namespace.
	 */
	if (data->nsze == 0) {
		device_printf(sc->dev,
		    "ignoring active namespace %u with zero size\n", nsid);
		return (true);
	}

	sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);

	nvmf_sim_rescan_ns(sc, nsid);
	return (true);
}

static bool
nvmf_add_namespaces(struct nvmf_softc *sc)
{
	sc->ns = mallocarray(sc->cdata->nn, sizeof(*sc->ns), M_NVMF,
	    M_WAITOK | M_ZERO);
	return (nvmf_scan_active_namespaces(sc, nvmf_add_ns, NULL));
}
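
/*
 * Attach a new nvmeX device: establish the association described by
 * the handoff nvlist, fetch the CAP and VS properties, create the SIM
 * and active namespaces, and create the controller character device.
 */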
static int
nvmf_attach(device_t dev)
{
	struct make_dev_args mda;
	struct nvmf_softc *sc = device_get_softc(dev);
	const nvlist_t *nvl = device_get_ivars(dev);
	const nvlist_t * const *io;
	struct sysctl_oid *oid;
	uint64_t val;
	u_int i;
	int error;

	if (nvl == NULL)
		return (ENXIO);

	sc->dev = dev;
	sc->trtype = nvlist_get_number(nvl, "trtype");
	callout_init(&sc->ka_rx_timer, 1);
	callout_init(&sc->ka_tx_timer, 1);
	sx_init(&sc->connection_lock, "nvmf connection");
	TASK_INIT(&sc->disconnect_task, 0, nvmf_disconnect_task, sc);

	oid = SYSCTL_ADD_NODE(device_get_sysctl_ctx(dev),
	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "ioq",
	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "I/O Queues");
	sc->ioq_oid_list = SYSCTL_CHILDREN(oid);

	sc->cdata = malloc(sizeof(*sc->cdata), M_NVMF, M_WAITOK);

	nvmf_init_aer(sc);

	error = nvmf_establish_connection(sc, nvl);
	if (error != 0)
		goto out;

	error = nvmf_read_property(sc, NVMF_PROP_CAP, 8, &sc->cap);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch CAP\n");
		error = ENXIO;
		goto out;
	}

	error = nvmf_read_property(sc, NVMF_PROP_VS, 4, &val);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch VS\n");
		error = ENXIO;
		goto out;
	}
	sc->vs = val;

	/* Honor MDTS if it is set. */
	sc->max_xfer_size = maxphys;
	if (sc->cdata->mdts != 0) {
		sc->max_xfer_size = ulmin(sc->max_xfer_size,
		    1 << (sc->cdata->mdts + NVME_MPS_SHIFT +
		    NVME_CAP_HI_MPSMIN(sc->cap >> 32)));
	}

	io = nvlist_get_nvlist_array(nvl, "io", NULL);
	sc->max_pending_io = nvlist_get_number(io[0], "qsize") *
	    sc->num_io_queues;

	error = nvmf_init_sim(sc);
	if (error != 0)
		goto out;

	error = nvmf_start_aer(sc);
	if (error != 0) {
		nvmf_destroy_sim(sc);
		goto out;
	}

	if (!nvmf_add_namespaces(sc)) {
		error = ENXIO;
		nvmf_destroy_sim(sc);
		goto out;
	}

	make_dev_args_init(&mda);
	mda.mda_devsw = &nvmf_cdevsw;
	mda.mda_uid = UID_ROOT;
	mda.mda_gid = GID_WHEEL;
	mda.mda_mode = 0600;
	mda.mda_si_drv1 = sc;
	error = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev));
	if (error != 0) {
		nvmf_destroy_sim(sc);
		goto out;
	}

	sc->shutdown_pre_sync_eh = EVENTHANDLER_REGISTER(shutdown_pre_sync,
	    nvmf_shutdown_pre_sync, sc, SHUTDOWN_PRI_FIRST);
	sc->shutdown_post_sync_eh = EVENTHANDLER_REGISTER(shutdown_post_sync,
	    nvmf_shutdown_post_sync, sc, SHUTDOWN_PRI_FIRST);

	return (0);
out:
	if (sc->ns != NULL) {
		for (i = 0; i < sc->cdata->nn; i++) {
			if (sc->ns[i] != NULL)
				nvmf_destroy_ns(sc->ns[i]);
		}
		free(sc->ns, M_NVMF);
	}

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);

	if (sc->admin != NULL)
		nvmf_shutdown_controller(sc);

	for (i = 0; i < sc->num_io_queues; i++) {
		if (sc->io[i] != NULL)
			nvmf_destroy_qp(sc->io[i]);
	}
	free(sc->io, M_NVMF);
	if (sc->admin != NULL)
		nvmf_destroy_qp(sc->admin);

	nvmf_destroy_aer(sc);

	taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
	sx_destroy(&sc->connection_lock);
	free(sc->cdata, M_NVMF);
	return (error);
}

void
nvmf_disconnect(struct nvmf_softc *sc)
{
	taskqueue_enqueue(taskqueue_thread, &sc->disconnect_task);
}
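
/*
 * Tear down an association after a transport error: stop the
 * KeepAlive timers, quiesce namespace consumers, and destroy all
 * queue pairs.  A later NVMF_RECONNECT_HOST ioctl may establish a new
 * association.
 */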
static void
nvmf_disconnect_task(void *arg, int pending __unused)
{
	struct nvmf_softc *sc = arg;
	u_int i;

	sx_xlock(&sc->connection_lock);
	if (sc->admin == NULL) {
		/*
		 * Ignore transport errors if there is no active
		 * association.
		 */
		sx_xunlock(&sc->connection_lock);
		return;
	}

	if (sc->detaching) {
		if (sc->admin != NULL) {
			/*
			 * This unsticks the detach process if a
			 * transport error occurs during detach.
			 */
			nvmf_shutdown_qp(sc->admin);
		}
		sx_xunlock(&sc->connection_lock);
		return;
	}

	if (sc->cdev == NULL) {
		/*
		 * A transport error occurred during attach
		 * (nvmf_add_namespaces).  Shutdown the admin queue.
		 */
		nvmf_shutdown_qp(sc->admin);
		sx_xunlock(&sc->connection_lock);
		return;
	}

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);
	sc->ka_traffic = false;

	/* Quiesce namespace consumers. */
	nvmf_disconnect_sim(sc);
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_disconnect_ns(sc->ns[i]);
	}

	/* Shutdown the existing qpairs. */
	for (i = 0; i < sc->num_io_queues; i++) {
		nvmf_destroy_qp(sc->io[i]);
	}
	free(sc->io, M_NVMF);
	sc->io = NULL;
	sc->num_io_queues = 0;
	nvmf_destroy_qp(sc->admin);
	sc->admin = NULL;

	sx_xunlock(&sc->connection_lock);
}

static int
nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
{
	const struct nvme_controller_data *cdata;
	nvlist_t *nvl;
	u_int i;
	int error;

	error = nvmf_copyin_handoff(nv, &nvl);
	if (error != 0)
		return (error);

	/* XXX: Should we permit changing the transport type? */
	if (sc->trtype != nvlist_get_number(nvl, "trtype")) {
		device_printf(sc->dev,
		    "transport type mismatch on reconnect\n");
		nvlist_destroy(nvl);
		return (EINVAL);
	}

	sx_xlock(&sc->connection_lock);
	if (sc->admin != NULL || sc->detaching) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Ensure this is for the same controller.  Note that the
	 * controller ID can vary across associations if the remote
	 * system is using the dynamic controller model.  This merely
	 * ensures the new association is connected to the same NVMe
	 * subsystem.
	 */
	cdata = nvlist_get_binary(nvl, "cdata", NULL);
	if (memcmp(sc->cdata->subnqn, cdata->subnqn,
	    sizeof(cdata->subnqn)) != 0) {
		device_printf(sc->dev,
		    "controller subsystem NQN mismatch on reconnect\n");
		error = EINVAL;
		goto out;
	}

	/*
	 * XXX: Require same number and size of I/O queues so that
	 * max_pending_io is still correct?
	 */

	error = nvmf_establish_connection(sc, nvl);
	if (error != 0)
		goto out;

	error = nvmf_start_aer(sc);
	if (error != 0)
		goto out;

	device_printf(sc->dev,
	    "established new association with %u I/O queues\n",
	    sc->num_io_queues);

	/* Restart namespace consumers. */
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_reconnect_ns(sc->ns[i]);
	}
	nvmf_reconnect_sim(sc);

	nvmf_rescan_all_ns(sc);
out:
	sx_xunlock(&sc->connection_lock);
	nvlist_destroy(nvl);
	return (error);
}
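
/*
 * The pre-sync shutdown handler aborts pending I/O for an association
 * that is already disconnected so that filesystems can sync and
 * unmount; the post-sync handler gracefully shuts down a connected
 * controller.
 */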
static void
nvmf_shutdown_pre_sync(void *arg, int howto)
{
	struct nvmf_softc *sc = arg;

	if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED())
		return;

	/*
	 * If this association is disconnected, abort any pending
	 * requests with an error to permit filesystems to unmount
	 * without hanging.
	 */
	sx_xlock(&sc->connection_lock);
	if (sc->admin != NULL || sc->detaching) {
		sx_xunlock(&sc->connection_lock);
		return;
	}

	for (u_int i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_shutdown_ns(sc->ns[i]);
	}
	nvmf_shutdown_sim(sc);
	sx_xunlock(&sc->connection_lock);
}

static void
nvmf_shutdown_post_sync(void *arg, int howto)
{
	struct nvmf_softc *sc = arg;

	if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED())
		return;

	/*
	 * If this association is connected, disconnect gracefully.
	 */
	sx_xlock(&sc->connection_lock);
	if (sc->admin == NULL || sc->detaching) {
		sx_xunlock(&sc->connection_lock);
		return;
	}

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);

	nvmf_shutdown_controller(sc);
	for (u_int i = 0; i < sc->num_io_queues; i++) {
		nvmf_destroy_qp(sc->io[i]);
	}
	nvmf_destroy_qp(sc->admin);
	sc->admin = NULL;
	sx_xunlock(&sc->connection_lock);
}

static int
nvmf_detach(device_t dev)
{
	struct nvmf_softc *sc = device_get_softc(dev);
	u_int i;

	destroy_dev(sc->cdev);

	sx_xlock(&sc->connection_lock);
	sc->detaching = true;
	sx_xunlock(&sc->connection_lock);

	EVENTHANDLER_DEREGISTER(shutdown_pre_sync, sc->shutdown_pre_sync_eh);
	EVENTHANDLER_DEREGISTER(shutdown_post_sync, sc->shutdown_post_sync_eh);

	nvmf_destroy_sim(sc);
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_destroy_ns(sc->ns[i]);
	}
	free(sc->ns, M_NVMF);

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);

	if (sc->admin != NULL)
		nvmf_shutdown_controller(sc);

	for (i = 0; i < sc->num_io_queues; i++) {
		nvmf_destroy_qp(sc->io[i]);
	}
	free(sc->io, M_NVMF);

	taskqueue_drain(taskqueue_thread, &sc->disconnect_task);

	if (sc->admin != NULL)
		nvmf_destroy_qp(sc->admin);

	nvmf_destroy_aer(sc);

	sx_destroy(&sc->connection_lock);
	free(sc->cdata, M_NVMF);
	return (0);
}

static void
nvmf_rescan_ns_1(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data)
{
	struct nvmf_namespace *ns;

	/* XXX: Needs locking around sc->ns[]. */
	ns = sc->ns[nsid - 1];
	if (data->nsze == 0) {
		/* XXX: Needs locking */
		if (ns != NULL) {
			nvmf_destroy_ns(ns);
			sc->ns[nsid - 1] = NULL;
		}
	} else {
		/* XXX: Needs locking */
		if (ns == NULL) {
			sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
		} else {
			if (!nvmf_update_ns(ns, data)) {
				nvmf_destroy_ns(ns);
				sc->ns[nsid - 1] = NULL;
			}
		}
	}

	nvmf_sim_rescan_ns(sc, nsid);
}

void
nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid)
{
	struct nvmf_completion_status status;
	struct nvme_namespace_data *data;

	data = malloc(sizeof(*data), M_NVMF, M_WAITOK);

	nvmf_status_init(&status);
	nvmf_status_wait_io(&status);
	if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
	    &status, nvmf_io_complete, &status, M_WAITOK)) {
		device_printf(sc->dev,
		    "failed to send IDENTIFY namespace %u command\n", nsid);
		free(data, M_NVMF);
		return;
	}
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev,
		    "IDENTIFY namespace %u failed, status %#x\n", nsid,
		    le16toh(status.cqe.status));
		free(data, M_NVMF);
		return;
	}

	if (status.io_error != 0) {
		device_printf(sc->dev,
		    "IDENTIFY namespace %u failed with I/O error %d\n",
		    nsid, status.io_error);
		free(data, M_NVMF);
		return;
	}

	nvme_namespace_data_swapbytes(data);

	nvmf_rescan_ns_1(sc, nsid, data);

	free(data, M_NVMF);
}
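
/*
 * Destroy any attached namespaces with IDs in the range [first_nsid,
 * next_valid_nsid), which are no longer present in the active
 * namespace list.
 */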
static void
nvmf_purge_namespaces(struct nvmf_softc *sc, uint32_t first_nsid,
    uint32_t next_valid_nsid)
{
	struct nvmf_namespace *ns;

	for (uint32_t nsid = first_nsid; nsid < next_valid_nsid; nsid++) {
		/* XXX: Needs locking around sc->ns[]. */
		ns = sc->ns[nsid - 1];
		if (ns != NULL) {
			nvmf_destroy_ns(ns);
			sc->ns[nsid - 1] = NULL;

			nvmf_sim_rescan_ns(sc, nsid);
		}
	}
}

static bool
nvmf_rescan_ns_cb(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data, void *arg)
{
	uint32_t *last_nsid = arg;

	/* Check for any gaps prior to this namespace. */
	nvmf_purge_namespaces(sc, *last_nsid + 1, nsid);
	*last_nsid = nsid;

	nvmf_rescan_ns_1(sc, nsid, data);
	return (true);
}

void
nvmf_rescan_all_ns(struct nvmf_softc *sc)
{
	uint32_t last_nsid;

	last_nsid = 0;
	if (!nvmf_scan_active_namespaces(sc, nvmf_rescan_ns_cb, &last_nsid))
		return;

	/*
	 * Check for any namespace devices after the last active
	 * namespace.
	 */
	nvmf_purge_namespaces(sc, last_nsid + 1, sc->cdata->nn + 1);
}
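
/*
 * Execute a passthrough command submitted via ioctl, bouncing any
 * user data through a temporary kernel buffer.
 */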
int
nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt,
    bool admin)
{
	struct nvmf_completion_status status;
	struct nvme_command cmd;
	struct memdesc mem;
	struct nvmf_host_qpair *qp;
	struct nvmf_request *req;
	void *buf;
	int error;

	if (pt->len > sc->max_xfer_size)
		return (EINVAL);

	buf = NULL;
	if (pt->len != 0) {
		/*
		 * XXX: Depending on the size we may want to pin the
		 * user pages and use a memdesc with vm_page_t's
		 * instead.
		 */
		buf = malloc(pt->len, M_NVMF, M_WAITOK);
		if (pt->is_read == 0) {
			error = copyin(pt->buf, buf, pt->len);
			if (error != 0) {
				free(buf, M_NVMF);
				return (error);
			}
		} else {
			/* Ensure no kernel data is leaked to userland. */
			memset(buf, 0, pt->len);
		}
	}

	memset(&cmd, 0, sizeof(cmd));
	cmd.opc = pt->cmd.opc;
	cmd.fuse = pt->cmd.fuse;
	cmd.nsid = pt->cmd.nsid;
	cmd.cdw10 = pt->cmd.cdw10;
	cmd.cdw11 = pt->cmd.cdw11;
	cmd.cdw12 = pt->cmd.cdw12;
	cmd.cdw13 = pt->cmd.cdw13;
	cmd.cdw14 = pt->cmd.cdw14;
	cmd.cdw15 = pt->cmd.cdw15;

	sx_slock(&sc->connection_lock);
	if (sc->admin == NULL || sc->detaching) {
		device_printf(sc->dev,
		    "failed to send passthrough command\n");
		error = ECONNABORTED;
		sx_sunlock(&sc->connection_lock);
		goto error;
	}
	if (admin)
		qp = sc->admin;
	else
		qp = nvmf_select_io_queue(sc);
	nvmf_status_init(&status);
	req = nvmf_allocate_request(qp, &cmd, nvmf_complete, &status, M_WAITOK);
	sx_sunlock(&sc->connection_lock);
	if (req == NULL) {
		device_printf(sc->dev, "failed to send passthrough command\n");
		error = ECONNABORTED;
		goto error;
	}

	if (pt->len != 0) {
		mem = memdesc_vaddr(buf, pt->len);
		nvmf_capsule_append_data(req->nc, &mem, pt->len,
		    pt->is_read == 0, nvmf_io_complete, &status);
		nvmf_status_wait_io(&status);
	}

	nvmf_submit_request(req);
	nvmf_wait_for_reply(&status);

	memset(&pt->cpl, 0, sizeof(pt->cpl));
	pt->cpl.cdw0 = status.cqe.cdw0;
	pt->cpl.status = status.cqe.status;

	error = status.io_error;
	if (error == 0 && pt->len != 0 && pt->is_read != 0)
		error = copyout(buf, pt->buf, pt->len);
error:
	free(buf, M_NVMF);
	return (error);
}

static int
nvmf_reconnect_params(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
{
	nvlist_t *nvl;
	int error;

	nvl = nvlist_create(0);

	sx_slock(&sc->connection_lock);
	if ((sc->cdata->fcatt & 1) == 0)
		nvlist_add_number(nvl, "cntlid", NVMF_CNTLID_DYNAMIC);
	else
		nvlist_add_number(nvl, "cntlid", sc->cdata->ctrlr_id);
	nvlist_add_stringf(nvl, "subnqn", "%.256s", sc->cdata->subnqn);
	sx_sunlock(&sc->connection_lock);

	error = nvmf_pack_ioc_nvlist(nvl, nv);
	nvlist_destroy(nvl);
	return (error);
}

static int
nvmf_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
    struct thread *td)
{
	struct nvmf_softc *sc = cdev->si_drv1;
	struct nvme_get_nsid *gnsid;
	struct nvme_pt_command *pt;
	struct nvmf_ioc_nv *nv;

	switch (cmd) {
	case NVME_PASSTHROUGH_CMD:
		pt = (struct nvme_pt_command *)arg;
		return (nvmf_passthrough_cmd(sc, pt, true));
	case NVME_GET_NSID:
		gnsid = (struct nvme_get_nsid *)arg;
		strlcpy(gnsid->cdev, device_get_nameunit(sc->dev),
		    sizeof(gnsid->cdev));
		gnsid->nsid = 0;
		return (0);
	case NVME_GET_MAX_XFER_SIZE:
		*(uint64_t *)arg = sc->max_xfer_size;
		return (0);
	case NVMF_RECONNECT_PARAMS:
		nv = (struct nvmf_ioc_nv *)arg;
		return (nvmf_reconnect_params(sc, nv));
	case NVMF_RECONNECT_HOST:
		nv = (struct nvmf_ioc_nv *)arg;
		return (nvmf_reconnect_host(sc, nv));
	default:
		return (ENOTTY);
	}
}

static struct cdevsw nvmf_cdevsw = {
	.d_version = D_VERSION,
	.d_ioctl = nvmf_ioctl
};

static int
nvmf_modevent(module_t mod, int what, void *arg)
{
	switch (what) {
	case MOD_LOAD:
		return (nvmf_ctl_load());
	case MOD_QUIESCE:
		return (0);
	case MOD_UNLOAD:
		nvmf_ctl_unload();
		destroy_dev_drain(&nvmf_cdevsw);
		return (0);
	default:
		return (EOPNOTSUPP);
	}
}

static device_method_t nvmf_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		nvmf_probe),
	DEVMETHOD(device_attach,	nvmf_attach),
	DEVMETHOD(device_detach,	nvmf_detach),
	DEVMETHOD_END
};

driver_t nvme_nvmf_driver = {
	"nvme",
	nvmf_methods,
	sizeof(struct nvmf_softc),
};

DRIVER_MODULE(nvme, root, nvme_nvmf_driver, nvmf_modevent, NULL);
MODULE_DEPEND(nvmf, nvmf_transport, 1, 1, 1);