/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2023-2024 Chelsio Communications, Inc.
 * Written by: John Baldwin <jhb@FreeBSD.org>
 */

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/dnv.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/memdesc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/nv.h>
#include <sys/reboot.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/nvmf.h>
#include <dev/nvmf/nvmf_transport.h>
#include <dev/nvmf/host/nvmf_var.h>

static struct cdevsw nvmf_cdevsw;

bool nvmf_fail_disconnect = false;
SYSCTL_BOOL(_kern_nvmf, OID_AUTO, fail_on_disconnection, CTLFLAG_RWTUN,
    &nvmf_fail_disconnect, 0, "Fail I/O requests on connection failure");

MALLOC_DEFINE(M_NVMF, "nvmf", "NVMe over Fabrics host");

static void	nvmf_disconnect_task(void *arg, int pending);
static void	nvmf_shutdown_pre_sync(void *arg, int howto);
static void	nvmf_shutdown_post_sync(void *arg, int howto);

void
nvmf_complete(void *arg, const struct nvme_completion *cqe)
{
	struct nvmf_completion_status *status = arg;
	struct mtx *mtx;

	status->cqe = *cqe;
	mtx = mtx_pool_find(mtxpool_sleep, status);
	mtx_lock(mtx);
	status->done = true;
	mtx_unlock(mtx);
	wakeup(status);
}

void
nvmf_io_complete(void *arg, size_t xfered, int error)
{
	struct nvmf_completion_status *status = arg;
	struct mtx *mtx;

	status->io_error = error;
	mtx = mtx_pool_find(mtxpool_sleep, status);
	mtx_lock(mtx);
	status->io_done = true;
	mtx_unlock(mtx);
	wakeup(status);
}

void
nvmf_wait_for_reply(struct nvmf_completion_status *status)
{
	struct mtx *mtx;

	mtx = mtx_pool_find(mtxpool_sleep, status);
	mtx_lock(mtx);
	while (!status->done || !status->io_done)
		mtx_sleep(status, mtx, 0, "nvmfcmd", 0);
	mtx_unlock(mtx);
}

static int
nvmf_read_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
    uint64_t *value)
{
	const struct nvmf_fabric_prop_get_rsp *rsp;
	struct nvmf_completion_status status;

	nvmf_status_init(&status);
	if (!nvmf_cmd_get_property(sc, offset, size, nvmf_complete, &status,
	    M_WAITOK))
		return (ECONNABORTED);
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev, "PROPERTY_GET failed, status %#x\n",
		    le16toh(status.cqe.status));
		return (EIO);
	}

	rsp = (const struct nvmf_fabric_prop_get_rsp *)&status.cqe;
	if (size == 8)
		*value = le64toh(rsp->value.u64);
	else
		*value = le32toh(rsp->value.u32.low);
	return (0);
}

static int
nvmf_write_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
    uint64_t value)
{
	struct nvmf_completion_status status;

	nvmf_status_init(&status);
	if (!nvmf_cmd_set_property(sc, offset, size, value, nvmf_complete, &status,
	    M_WAITOK))
		return (ECONNABORTED);
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev, "PROPERTY_SET failed, status %#x\n",
		    le16toh(status.cqe.status));
		return (EIO);
	}
	return (0);
}

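/*
 * Request a normal controller shutdown by setting CC.SHN via a
 * Fabrics PROPERTY_SET of the CC property.  Errors are reported but
 * not returned to the caller since this runs on detach and system
 * shutdown paths where there is nothing further to unwind.
 */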
static void
nvmf_shutdown_controller(struct nvmf_softc *sc)
{
	uint64_t cc;
	int error;

	error = nvmf_read_property(sc, NVMF_PROP_CC, 4, &cc);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch CC for shutdown\n");
		return;
	}

	cc |= NVMEF(NVME_CC_REG_SHN, NVME_SHN_NORMAL);

	error = nvmf_write_property(sc, NVMF_PROP_CC, 4, cc);
	if (error != 0)
		device_printf(sc->dev,
		    "Failed to set CC to trigger shutdown\n");
}

static void
nvmf_check_keep_alive(void *arg)
{
	struct nvmf_softc *sc = arg;
	int traffic;

	traffic = atomic_readandclear_int(&sc->ka_active_rx_traffic);
	if (traffic == 0) {
		device_printf(sc->dev,
		    "disconnecting due to KeepAlive timeout\n");
		nvmf_disconnect(sc);
		return;
	}

	callout_schedule_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0, C_HARDCLOCK);
}

static void
nvmf_keep_alive_complete(void *arg, const struct nvme_completion *cqe)
{
	struct nvmf_softc *sc = arg;

	atomic_store_int(&sc->ka_active_rx_traffic, 1);
	if (cqe->status != 0) {
		device_printf(sc->dev,
		    "KeepAlive response reported status %#x\n",
		    le16toh(cqe->status));
	}
}

static void
nvmf_send_keep_alive(void *arg)
{
	struct nvmf_softc *sc = arg;
	int traffic;

	/*
	 * Don't bother sending a KeepAlive command if TKAS is active
	 * and another command has been sent during the interval.
	 */
	traffic = atomic_load_int(&sc->ka_active_tx_traffic);
	if (traffic == 0 && !nvmf_cmd_keep_alive(sc, nvmf_keep_alive_complete,
	    sc, M_NOWAIT))
		device_printf(sc->dev,
		    "Failed to allocate KeepAlive command\n");

	/* Clear ka_active_tx_traffic after sending the keep alive command. */
	atomic_store_int(&sc->ka_active_tx_traffic, 0);

	callout_schedule_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, C_HARDCLOCK);
}

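/*
 * Validate and unpack a connection handoff nvlist copied in from
 * userland.  Based on the checks below, the packed nvlist is expected
 * to contain at least:
 *
 *	"trtype"	(number)	transport type
 *	"admin"		(nvlist)	admin queue pair parameters
 *	"io"		(nvlist array)	I/O queue pair parameters
 *	"cdata"		(binary)	struct nvme_controller_data
 *	"rparams"	(nvlist)	reconnect parameters:
 *		"dle"		(binary)	struct nvme_discovery_log_entry
 *		"hostnqn"	(string)
 *		"num_io_queues"	(number)
 *		"io_qsize"	(number)
 *
 * On success the unpacked nvlist is returned via *nvlp and the caller
 * is responsible for destroying it.
 */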
int
nvmf_copyin_handoff(const struct nvmf_ioc_nv *nv, nvlist_t **nvlp)
{
	const struct nvme_discovery_log_entry *dle;
	const struct nvme_controller_data *cdata;
	const nvlist_t *const *io;
	const nvlist_t *admin, *rparams;
	nvlist_t *nvl;
	size_t i, num_io_queues;
	uint32_t qsize;
	int error;

	error = nvmf_unpack_ioc_nvlist(nv, &nvl);
	if (error != 0)
		return (error);

	if (!nvlist_exists_number(nvl, "trtype") ||
	    !nvlist_exists_nvlist(nvl, "admin") ||
	    !nvlist_exists_nvlist_array(nvl, "io") ||
	    !nvlist_exists_binary(nvl, "cdata") ||
	    !nvlist_exists_nvlist(nvl, "rparams"))
		goto invalid;

	rparams = nvlist_get_nvlist(nvl, "rparams");
	if (!nvlist_exists_binary(rparams, "dle") ||
	    !nvlist_exists_string(rparams, "hostnqn") ||
	    !nvlist_exists_number(rparams, "num_io_queues") ||
	    !nvlist_exists_number(rparams, "io_qsize"))
		goto invalid;

	admin = nvlist_get_nvlist(nvl, "admin");
	if (!nvmf_validate_qpair_nvlist(admin, false))
		goto invalid;
	if (!nvlist_get_bool(admin, "admin"))
		goto invalid;

	io = nvlist_get_nvlist_array(nvl, "io", &num_io_queues);
	if (num_io_queues < 1 ||
	    num_io_queues != nvlist_get_number(rparams, "num_io_queues"))
		goto invalid;
	for (i = 0; i < num_io_queues; i++) {
		if (!nvmf_validate_qpair_nvlist(io[i], false))
			goto invalid;
	}

	/* Require all I/O queues to be the same size. */
	qsize = nvlist_get_number(rparams, "io_qsize");
	for (i = 0; i < num_io_queues; i++) {
		if (nvlist_get_number(io[i], "qsize") != qsize)
			goto invalid;
	}

	cdata = nvlist_get_binary(nvl, "cdata", &i);
	if (i != sizeof(*cdata))
		goto invalid;
	dle = nvlist_get_binary(rparams, "dle", &i);
	if (i != sizeof(*dle))
		goto invalid;

	if (memcmp(dle->subnqn, cdata->subnqn, sizeof(cdata->subnqn)) != 0)
		goto invalid;

	*nvlp = nvl;
	return (0);
invalid:
	nvlist_destroy(nvl);
	return (EINVAL);
}

static int
nvmf_probe(device_t dev)
{
	const nvlist_t *nvl = device_get_ivars(dev);
	const struct nvme_controller_data *cdata;

	if (nvl == NULL)
		return (ENXIO);

	cdata = nvlist_get_binary(nvl, "cdata", NULL);
	device_set_descf(dev, "Fabrics: %.256s", cdata->subnqn);
	return (BUS_PROBE_DEFAULT);
}

static int
nvmf_establish_connection(struct nvmf_softc *sc, nvlist_t *nvl)
{
	const nvlist_t *const *io;
	const nvlist_t *admin;
	uint64_t kato;
	size_t num_io_queues;
	enum nvmf_trtype trtype;
	char name[16];

	trtype = nvlist_get_number(nvl, "trtype");
	admin = nvlist_get_nvlist(nvl, "admin");
	io = nvlist_get_nvlist_array(nvl, "io", &num_io_queues);
	kato = dnvlist_get_number(nvl, "kato", 0);

	/* Setup the admin queue. */
	sc->admin = nvmf_init_qp(sc, trtype, admin, "admin queue", 0);
	if (sc->admin == NULL) {
		device_printf(sc->dev, "Failed to setup admin queue\n");
		return (ENXIO);
	}

	/* Setup I/O queues. */
	sc->io = malloc(num_io_queues * sizeof(*sc->io), M_NVMF,
	    M_WAITOK | M_ZERO);
	sc->num_io_queues = num_io_queues;
	for (u_int i = 0; i < sc->num_io_queues; i++) {
		snprintf(name, sizeof(name), "I/O queue %u", i);
		sc->io[i] = nvmf_init_qp(sc, trtype, io[i], name, i);
		if (sc->io[i] == NULL) {
			device_printf(sc->dev, "Failed to setup I/O queue %u\n",
			    i);
			return (ENXIO);
		}
	}

	/* Start KeepAlive timers. */
	if (kato != 0) {
		sc->ka_traffic = NVMEV(NVME_CTRLR_DATA_CTRATT_TBKAS,
		    sc->cdata->ctratt) != 0;
		sc->ka_rx_sbt = mstosbt(kato);
		sc->ka_tx_sbt = sc->ka_rx_sbt / 2;
		callout_reset_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0,
		    nvmf_check_keep_alive, sc, C_HARDCLOCK);
		callout_reset_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0,
		    nvmf_send_keep_alive, sc, C_HARDCLOCK);
	}

	memcpy(sc->cdata, nvlist_get_binary(nvl, "cdata", NULL),
	    sizeof(*sc->cdata));

	/* Save reconnect parameters. */
	nvlist_destroy(sc->rparams);
	sc->rparams = nvlist_take_nvlist(nvl, "rparams");

	return (0);
}

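/*
 * Helpers for walking the controller's active namespaces:
 * nvmf_scan_active_namespaces() repeatedly fetches a chunk of the
 * active namespace ID list, issues an IDENTIFY namespace command for
 * each NSID, and invokes the callback with the byte-swapped namespace
 * data.  A callback returning false aborts the scan.
 */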
typedef bool nvmf_scan_active_ns_cb(struct nvmf_softc *, uint32_t,
    const struct nvme_namespace_data *, void *);

static bool
nvmf_scan_active_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist,
    struct nvme_namespace_data *data, uint32_t *nsidp,
    nvmf_scan_active_ns_cb *cb, void *cb_arg)
{
	struct nvmf_completion_status status;
	uint32_t nsid;

	nvmf_status_init(&status);
	nvmf_status_wait_io(&status);
	if (!nvmf_cmd_identify_active_namespaces(sc, *nsidp, nslist,
	    nvmf_complete, &status, nvmf_io_complete, &status, M_WAITOK)) {
		device_printf(sc->dev,
		    "failed to send IDENTIFY active namespaces command\n");
		return (false);
	}
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev,
		    "IDENTIFY active namespaces failed, status %#x\n",
		    le16toh(status.cqe.status));
		return (false);
	}

	if (status.io_error != 0) {
		device_printf(sc->dev,
		    "IDENTIFY active namespaces failed with I/O error %d\n",
		    status.io_error);
		return (false);
	}

	for (u_int i = 0; i < nitems(nslist->ns); i++) {
		nsid = nslist->ns[i];
		if (nsid == 0) {
			*nsidp = 0;
			return (true);
		}

		nvmf_status_init(&status);
		nvmf_status_wait_io(&status);
		if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
		    &status, nvmf_io_complete, &status, M_WAITOK)) {
			device_printf(sc->dev,
			    "failed to send IDENTIFY namespace %u command\n",
			    nsid);
			return (false);
		}
		nvmf_wait_for_reply(&status);

		if (status.cqe.status != 0) {
			device_printf(sc->dev,
			    "IDENTIFY namespace %u failed, status %#x\n", nsid,
			    le16toh(status.cqe.status));
			return (false);
		}

		if (status.io_error != 0) {
			device_printf(sc->dev,
			    "IDENTIFY namespace %u failed with I/O error %d\n",
			    nsid, status.io_error);
			return (false);
		}

		nvme_namespace_data_swapbytes(data);
		if (!cb(sc, nsid, data, cb_arg))
			return (false);
	}

	MPASS(nsid == nslist->ns[nitems(nslist->ns) - 1] && nsid != 0);

	if (nsid >= NVME_GLOBAL_NAMESPACE_TAG - 1)
		*nsidp = 0;
	else
		*nsidp = nsid;
	return (true);
}

static bool
nvmf_scan_active_namespaces(struct nvmf_softc *sc, nvmf_scan_active_ns_cb *cb,
    void *cb_arg)
{
	struct nvme_namespace_data *data;
	struct nvme_ns_list *nslist;
	uint32_t nsid;
	bool retval;

	nslist = malloc(sizeof(*nslist), M_NVMF, M_WAITOK);
	data = malloc(sizeof(*data), M_NVMF, M_WAITOK);

	nsid = 0;
	retval = true;
	for (;;) {
		if (!nvmf_scan_active_nslist(sc, nslist, data, &nsid, cb,
		    cb_arg)) {
			retval = false;
			break;
		}
		if (nsid == 0)
			break;
	}

	free(data, M_NVMF);
	free(nslist, M_NVMF);
	return (retval);
}

static bool
nvmf_add_ns(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data, void *arg __unused)
{
	if (sc->ns[nsid - 1] != NULL) {
		device_printf(sc->dev,
		    "duplicate namespace %u in active namespace list\n",
		    nsid);
		return (false);
	}

	/*
	 * As in nvme_ns_construct, a size of zero indicates an
	 * invalid namespace.
	 */
	if (data->nsze == 0) {
		device_printf(sc->dev,
		    "ignoring active namespace %u with zero size\n", nsid);
		return (true);
	}

	sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);

	nvmf_sim_rescan_ns(sc, nsid);
	return (true);
}

static bool
nvmf_add_namespaces(struct nvmf_softc *sc)
{
	sc->ns = mallocarray(sc->cdata->nn, sizeof(*sc->ns), M_NVMF,
	    M_WAITOK | M_ZERO);
	return (nvmf_scan_active_namespaces(sc, nvmf_add_ns, NULL));
}

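/*
 * Attach a new controller device.  The ivars hold the handoff nvlist
 * validated by nvmf_copyin_handoff().  Attach establishes the
 * association, fetches the CAP and VS properties, computes the
 * maximum transfer size, creates the CAM SIM and namespace devices,
 * and exposes a character device for ioctls.
 */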
static int
nvmf_attach(device_t dev)
{
	struct make_dev_args mda;
	struct nvmf_softc *sc = device_get_softc(dev);
	nvlist_t *nvl = device_get_ivars(dev);
	const nvlist_t * const *io;
	struct sysctl_oid *oid;
	uint64_t val;
	u_int i;
	int error;

	if (nvl == NULL)
		return (ENXIO);

	sc->dev = dev;
	sc->trtype = nvlist_get_number(nvl, "trtype");
	callout_init(&sc->ka_rx_timer, 1);
	callout_init(&sc->ka_tx_timer, 1);
	sx_init(&sc->connection_lock, "nvmf connection");
	TASK_INIT(&sc->disconnect_task, 0, nvmf_disconnect_task, sc);

	oid = SYSCTL_ADD_NODE(device_get_sysctl_ctx(dev),
	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "ioq",
	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "I/O Queues");
	sc->ioq_oid_list = SYSCTL_CHILDREN(oid);

	sc->cdata = malloc(sizeof(*sc->cdata), M_NVMF, M_WAITOK);

	nvmf_init_aer(sc);

	error = nvmf_establish_connection(sc, nvl);
	if (error != 0)
		goto out;

	error = nvmf_read_property(sc, NVMF_PROP_CAP, 8, &sc->cap);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch CAP\n");
		error = ENXIO;
		goto out;
	}

	error = nvmf_read_property(sc, NVMF_PROP_VS, 4, &val);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch VS\n");
		error = ENXIO;
		goto out;
	}
	sc->vs = val;

	/* Honor MDTS if it is set. */
	sc->max_xfer_size = maxphys;
	if (sc->cdata->mdts != 0) {
		sc->max_xfer_size = ulmin(sc->max_xfer_size,
		    1 << (sc->cdata->mdts + NVME_MPS_SHIFT +
		    NVME_CAP_HI_MPSMIN(sc->cap >> 32)));
	}

	io = nvlist_get_nvlist_array(nvl, "io", NULL);
	sc->max_pending_io = nvlist_get_number(io[0], "qsize") *
	    sc->num_io_queues;

	error = nvmf_init_sim(sc);
	if (error != 0)
		goto out;

	error = nvmf_start_aer(sc);
	if (error != 0) {
		nvmf_destroy_sim(sc);
		goto out;
	}

	if (!nvmf_add_namespaces(sc)) {
		nvmf_destroy_sim(sc);
		goto out;
	}

	make_dev_args_init(&mda);
	mda.mda_devsw = &nvmf_cdevsw;
	mda.mda_uid = UID_ROOT;
	mda.mda_gid = GID_WHEEL;
	mda.mda_mode = 0600;
	mda.mda_si_drv1 = sc;
	error = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev));
	if (error != 0) {
		nvmf_destroy_sim(sc);
		goto out;
	}

	sc->shutdown_pre_sync_eh = EVENTHANDLER_REGISTER(shutdown_pre_sync,
	    nvmf_shutdown_pre_sync, sc, SHUTDOWN_PRI_FIRST);
	sc->shutdown_post_sync_eh = EVENTHANDLER_REGISTER(shutdown_post_sync,
	    nvmf_shutdown_post_sync, sc, SHUTDOWN_PRI_LAST);

	return (0);
out:
	if (sc->ns != NULL) {
		for (i = 0; i < sc->cdata->nn; i++) {
			if (sc->ns[i] != NULL)
				nvmf_destroy_ns(sc->ns[i]);
		}
		free(sc->ns, M_NVMF);
	}

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);

	if (sc->admin != NULL)
		nvmf_shutdown_controller(sc);

	for (i = 0; i < sc->num_io_queues; i++) {
		if (sc->io[i] != NULL)
			nvmf_destroy_qp(sc->io[i]);
	}
	free(sc->io, M_NVMF);
	if (sc->admin != NULL)
		nvmf_destroy_qp(sc->admin);

	nvmf_destroy_aer(sc);

	taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
	sx_destroy(&sc->connection_lock);
	nvlist_destroy(sc->rparams);
	free(sc->cdata, M_NVMF);
	return (error);
}

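/*
 * Transport errors are handled asynchronously: nvmf_disconnect() only
 * queues the disconnect task, and the task quiesces the SIM and
 * namespace consumers and destroys the existing queue pairs while
 * holding the connection lock.  A later NVMF_RECONNECT_HOST ioctl may
 * establish a new association.
 */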
void
nvmf_disconnect(struct nvmf_softc *sc)
{
	taskqueue_enqueue(taskqueue_thread, &sc->disconnect_task);
}

static void
nvmf_disconnect_task(void *arg, int pending __unused)
{
	struct nvmf_softc *sc = arg;
	u_int i;

	sx_xlock(&sc->connection_lock);
	if (sc->admin == NULL) {
		/*
		 * Ignore transport errors if there is no active
		 * association.
		 */
		sx_xunlock(&sc->connection_lock);
		return;
	}

	if (sc->detaching) {
		if (sc->admin != NULL) {
			/*
			 * This unsticks the detach process if a
			 * transport error occurs during detach.
			 */
			nvmf_shutdown_qp(sc->admin);
		}
		sx_xunlock(&sc->connection_lock);
		return;
	}

	if (sc->cdev == NULL) {
		/*
		 * Transport error occurred during attach (nvmf_add_namespaces).
		 * Shutdown the admin queue.
		 */
		nvmf_shutdown_qp(sc->admin);
		sx_xunlock(&sc->connection_lock);
		return;
	}

	nanotime(&sc->last_disconnect);
	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);
	sc->ka_traffic = false;

	/* Quiesce namespace consumers. */
	nvmf_disconnect_sim(sc);
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_disconnect_ns(sc->ns[i]);
	}

	/* Shutdown the existing qpairs. */
	for (i = 0; i < sc->num_io_queues; i++) {
		nvmf_destroy_qp(sc->io[i]);
	}
	free(sc->io, M_NVMF);
	sc->io = NULL;
	sc->num_io_queues = 0;
	nvmf_destroy_qp(sc->admin);
	sc->admin = NULL;

	sx_xunlock(&sc->connection_lock);
}

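/*
 * Handle an NVMF_RECONNECT_HOST request: validate the new handoff
 * data, require the same transport type and subsystem NQN as the
 * original association, establish the new queue pairs, and restart
 * namespace consumers followed by a full namespace rescan.
 */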
static int
nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
{
	const struct nvme_controller_data *cdata;
	nvlist_t *nvl;
	u_int i;
	int error;

	error = nvmf_copyin_handoff(nv, &nvl);
	if (error != 0)
		return (error);

	/* XXX: Should we permit changing the transport type? */
	if (sc->trtype != nvlist_get_number(nvl, "trtype")) {
		device_printf(sc->dev,
		    "transport type mismatch on reconnect\n");
		nvlist_destroy(nvl);
		return (EINVAL);
	}

	sx_xlock(&sc->connection_lock);
	if (sc->admin != NULL || sc->detaching) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Ensure this is for the same controller.  Note that the
	 * controller ID can vary across associations if the remote
	 * system is using the dynamic controller model.  This merely
	 * ensures the new association is connected to the same NVMe
	 * subsystem.
	 */
	cdata = nvlist_get_binary(nvl, "cdata", NULL);
	if (memcmp(sc->cdata->subnqn, cdata->subnqn,
	    sizeof(cdata->subnqn)) != 0) {
		device_printf(sc->dev,
		    "controller subsystem NQN mismatch on reconnect\n");
		error = EINVAL;
		goto out;
	}

	/*
	 * XXX: Require same number and size of I/O queues so that
	 * max_pending_io is still correct?
	 */

	error = nvmf_establish_connection(sc, nvl);
	if (error != 0)
		goto out;

	error = nvmf_start_aer(sc);
	if (error != 0)
		goto out;

	device_printf(sc->dev,
	    "established new association with %u I/O queues\n",
	    sc->num_io_queues);

	/* Restart namespace consumers. */
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_reconnect_ns(sc->ns[i]);
	}
	nvmf_reconnect_sim(sc);

	nvmf_rescan_all_ns(sc);
out:
	sx_xunlock(&sc->connection_lock);
	nvlist_destroy(nvl);
	return (error);
}

static void
nvmf_shutdown_pre_sync(void *arg, int howto)
{
	struct nvmf_softc *sc = arg;

	if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED())
		return;

	/*
	 * If this association is disconnected, abort any pending
	 * requests with an error to permit filesystems to unmount
	 * without hanging.
	 */
	sx_xlock(&sc->connection_lock);
	if (sc->admin != NULL || sc->detaching) {
		sx_xunlock(&sc->connection_lock);
		return;
	}

	for (u_int i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_shutdown_ns(sc->ns[i]);
	}
	nvmf_shutdown_sim(sc);
	sx_xunlock(&sc->connection_lock);
}

static void
nvmf_shutdown_post_sync(void *arg, int howto)
{
	struct nvmf_softc *sc = arg;

	if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED())
		return;

	/*
	 * If this association is connected, disconnect gracefully.
	 */
	sx_xlock(&sc->connection_lock);
	if (sc->admin == NULL || sc->detaching) {
		sx_xunlock(&sc->connection_lock);
		return;
	}

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);

	nvmf_shutdown_controller(sc);

	/*
	 * Quiesce consumers so that any commands submitted after this
	 * fail with an error.  Notably, nda(4) calls nda_flush() from
	 * a post_sync handler that might be ordered after this one.
	 */
	for (u_int i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_shutdown_ns(sc->ns[i]);
	}
	nvmf_shutdown_sim(sc);

	for (u_int i = 0; i < sc->num_io_queues; i++) {
		nvmf_destroy_qp(sc->io[i]);
	}
	nvmf_destroy_qp(sc->admin);
	sc->admin = NULL;
	sx_xunlock(&sc->connection_lock);
}

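/*
 * Detach tears down state in roughly the reverse order of attach:
 * destroy the character device, mark the softc as detaching (which
 * changes how a concurrent disconnect task behaves), destroy the SIM
 * and namespaces, stop the KeepAlive timers, and shut down and
 * destroy any remaining queue pairs.
 */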
static int
nvmf_detach(device_t dev)
{
	struct nvmf_softc *sc = device_get_softc(dev);
	u_int i;

	destroy_dev(sc->cdev);

	sx_xlock(&sc->connection_lock);
	sc->detaching = true;
	sx_xunlock(&sc->connection_lock);

	EVENTHANDLER_DEREGISTER(shutdown_pre_sync, sc->shutdown_pre_sync_eh);
	EVENTHANDLER_DEREGISTER(shutdown_post_sync, sc->shutdown_post_sync_eh);

	nvmf_destroy_sim(sc);
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_destroy_ns(sc->ns[i]);
	}
	free(sc->ns, M_NVMF);

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);

	if (sc->admin != NULL)
		nvmf_shutdown_controller(sc);

	for (i = 0; i < sc->num_io_queues; i++) {
		nvmf_destroy_qp(sc->io[i]);
	}
	free(sc->io, M_NVMF);

	taskqueue_drain(taskqueue_thread, &sc->disconnect_task);

	if (sc->admin != NULL)
		nvmf_destroy_qp(sc->admin);

	nvmf_destroy_aer(sc);

	sx_destroy(&sc->connection_lock);
	nvlist_destroy(sc->rparams);
	free(sc->cdata, M_NVMF);
	return (0);
}

static void
nvmf_rescan_ns_1(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data)
{
	struct nvmf_namespace *ns;

	/* XXX: Needs locking around sc->ns[]. */
	ns = sc->ns[nsid - 1];
	if (data->nsze == 0) {
		/* XXX: Needs locking */
		if (ns != NULL) {
			nvmf_destroy_ns(ns);
			sc->ns[nsid - 1] = NULL;
		}
	} else {
		/* XXX: Needs locking */
		if (ns == NULL) {
			sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
		} else {
			if (!nvmf_update_ns(ns, data)) {
				nvmf_destroy_ns(ns);
				sc->ns[nsid - 1] = NULL;
			}
		}
	}

	nvmf_sim_rescan_ns(sc, nsid);
}

void
nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid)
{
	struct nvmf_completion_status status;
	struct nvme_namespace_data *data;

	data = malloc(sizeof(*data), M_NVMF, M_WAITOK);

	nvmf_status_init(&status);
	nvmf_status_wait_io(&status);
	if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
	    &status, nvmf_io_complete, &status, M_WAITOK)) {
		device_printf(sc->dev,
		    "failed to send IDENTIFY namespace %u command\n", nsid);
		free(data, M_NVMF);
		return;
	}
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev,
		    "IDENTIFY namespace %u failed, status %#x\n", nsid,
		    le16toh(status.cqe.status));
		free(data, M_NVMF);
		return;
	}

	if (status.io_error != 0) {
		device_printf(sc->dev,
		    "IDENTIFY namespace %u failed with I/O error %d\n",
		    nsid, status.io_error);
		free(data, M_NVMF);
		return;
	}

	nvme_namespace_data_swapbytes(data);

	nvmf_rescan_ns_1(sc, nsid, data);

	free(data, M_NVMF);
}

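/*
 * Destroy any namespace devices in the half-open NSID range
 * [first_nsid, next_valid_nsid) that are no longer present in the
 * controller's active namespace list, requesting a SIM rescan for
 * each namespace removed.
 */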
static void
nvmf_purge_namespaces(struct nvmf_softc *sc, uint32_t first_nsid,
    uint32_t next_valid_nsid)
{
	struct nvmf_namespace *ns;

	for (uint32_t nsid = first_nsid; nsid < next_valid_nsid; nsid++)
	{
		/* XXX: Needs locking around sc->ns[]. */
		ns = sc->ns[nsid - 1];
		if (ns != NULL) {
			nvmf_destroy_ns(ns);
			sc->ns[nsid - 1] = NULL;

			nvmf_sim_rescan_ns(sc, nsid);
		}
	}
}

static bool
nvmf_rescan_ns_cb(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data, void *arg)
{
	uint32_t *last_nsid = arg;

	/* Check for any gaps prior to this namespace. */
	nvmf_purge_namespaces(sc, *last_nsid + 1, nsid);
	*last_nsid = nsid;

	nvmf_rescan_ns_1(sc, nsid, data);
	return (true);
}

void
nvmf_rescan_all_ns(struct nvmf_softc *sc)
{
	uint32_t last_nsid;

	last_nsid = 0;
	if (!nvmf_scan_active_namespaces(sc, nvmf_rescan_ns_cb, &last_nsid))
		return;

	/*
	 * Check for any namespace devices after the last active
	 * namespace.
	 */
	nvmf_purge_namespaces(sc, last_nsid + 1, sc->cdata->nn + 1);
}

int
nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt,
    bool admin)
{
	struct nvmf_completion_status status;
	struct nvme_command cmd;
	struct memdesc mem;
	struct nvmf_host_qpair *qp;
	struct nvmf_request *req;
	void *buf;
	int error;

	if (pt->len > sc->max_xfer_size)
		return (EINVAL);

	buf = NULL;
	if (pt->len != 0) {
		/*
		 * XXX: Depending on the size we may want to pin the
		 * user pages and use a memdesc with vm_page_t's
		 * instead.
		 */
		buf = malloc(pt->len, M_NVMF, M_WAITOK);
		if (pt->is_read == 0) {
			error = copyin(pt->buf, buf, pt->len);
			if (error != 0) {
				free(buf, M_NVMF);
				return (error);
			}
		} else {
			/* Ensure no kernel data is leaked to userland. */
			memset(buf, 0, pt->len);
		}
	}

	memset(&cmd, 0, sizeof(cmd));
	cmd.opc = pt->cmd.opc;
	cmd.fuse = pt->cmd.fuse;
	cmd.nsid = pt->cmd.nsid;
	cmd.cdw10 = pt->cmd.cdw10;
	cmd.cdw11 = pt->cmd.cdw11;
	cmd.cdw12 = pt->cmd.cdw12;
	cmd.cdw13 = pt->cmd.cdw13;
	cmd.cdw14 = pt->cmd.cdw14;
	cmd.cdw15 = pt->cmd.cdw15;

	sx_slock(&sc->connection_lock);
	if (sc->admin == NULL || sc->detaching) {
		device_printf(sc->dev,
		    "failed to send passthrough command\n");
		error = ECONNABORTED;
		sx_sunlock(&sc->connection_lock);
		goto error;
	}
	if (admin)
		qp = sc->admin;
	else
		qp = nvmf_select_io_queue(sc);
	nvmf_status_init(&status);
	req = nvmf_allocate_request(qp, &cmd, nvmf_complete, &status, M_WAITOK);
	sx_sunlock(&sc->connection_lock);
	if (req == NULL) {
		device_printf(sc->dev, "failed to send passthrough command\n");
		error = ECONNABORTED;
		goto error;
	}

	if (pt->len != 0) {
		mem = memdesc_vaddr(buf, pt->len);
		nvmf_capsule_append_data(req->nc, &mem, pt->len,
		    pt->is_read == 0, nvmf_io_complete, &status);
		nvmf_status_wait_io(&status);
	}

	nvmf_submit_request(req);
	nvmf_wait_for_reply(&status);

	memset(&pt->cpl, 0, sizeof(pt->cpl));
	pt->cpl.cdw0 = status.cqe.cdw0;
	pt->cpl.status = status.cqe.status;

	error = status.io_error;
	if (error == 0 && pt->len != 0 && pt->is_read != 0)
		error = copyout(buf, pt->buf, pt->len);
error:
	free(buf, M_NVMF);
	return (error);
}

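/*
 * Return the saved reconnect parameters nvlist to userland so that a
 * subsequent reconnect request can reuse the original connection
 * settings.
 */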
static int
nvmf_reconnect_params(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
{
	int error;

	sx_slock(&sc->connection_lock);
	error = nvmf_pack_ioc_nvlist(sc->rparams, nv);
	sx_sunlock(&sc->connection_lock);

	return (error);
}

static int
nvmf_connection_status(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
{
	nvlist_t *nvl, *nvl_ts;
	int error;

	nvl = nvlist_create(0);
	nvl_ts = nvlist_create(0);

	sx_slock(&sc->connection_lock);
	nvlist_add_bool(nvl, "connected", sc->admin != NULL);
	nvlist_add_number(nvl_ts, "tv_sec", sc->last_disconnect.tv_sec);
	nvlist_add_number(nvl_ts, "tv_nsec", sc->last_disconnect.tv_nsec);
	sx_sunlock(&sc->connection_lock);
	nvlist_move_nvlist(nvl, "last_disconnect", nvl_ts);

	error = nvmf_pack_ioc_nvlist(nvl, nv);
	nvlist_destroy(nvl);
	return (error);
}

static int
nvmf_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
    struct thread *td)
{
	struct nvmf_softc *sc = cdev->si_drv1;
	struct nvme_get_nsid *gnsid;
	struct nvme_pt_command *pt;
	struct nvmf_ioc_nv *nv;

	switch (cmd) {
	case NVME_PASSTHROUGH_CMD:
		pt = (struct nvme_pt_command *)arg;
		return (nvmf_passthrough_cmd(sc, pt, true));
	case NVME_GET_NSID:
		gnsid = (struct nvme_get_nsid *)arg;
		strlcpy(gnsid->cdev, device_get_nameunit(sc->dev),
		    sizeof(gnsid->cdev));
		gnsid->nsid = 0;
		return (0);
	case NVME_GET_MAX_XFER_SIZE:
		*(uint64_t *)arg = sc->max_xfer_size;
		return (0);
	case NVME_GET_CONTROLLER_DATA:
		memcpy(arg, sc->cdata, sizeof(*sc->cdata));
		return (0);
	case NVMF_RECONNECT_PARAMS:
		nv = (struct nvmf_ioc_nv *)arg;
		return (nvmf_reconnect_params(sc, nv));
	case NVMF_RECONNECT_HOST:
		nv = (struct nvmf_ioc_nv *)arg;
		return (nvmf_reconnect_host(sc, nv));
	case NVMF_CONNECTION_STATUS:
		nv = (struct nvmf_ioc_nv *)arg;
		return (nvmf_connection_status(sc, nv));
	default:
		return (ENOTTY);
	}
}

static struct cdevsw nvmf_cdevsw = {
	.d_version = D_VERSION,
	.d_ioctl = nvmf_ioctl
};

static int
nvmf_modevent(module_t mod, int what, void *arg)
{
	switch (what) {
	case MOD_LOAD:
		return (nvmf_ctl_load());
	case MOD_QUIESCE:
		return (0);
	case MOD_UNLOAD:
		nvmf_ctl_unload();
		destroy_dev_drain(&nvmf_cdevsw);
		return (0);
	default:
		return (EOPNOTSUPP);
	}
}

static device_method_t nvmf_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,     nvmf_probe),
	DEVMETHOD(device_attach,    nvmf_attach),
	DEVMETHOD(device_detach,    nvmf_detach),
	DEVMETHOD_END
};

driver_t nvme_nvmf_driver = {
	"nvme",
	nvmf_methods,
	sizeof(struct nvmf_softc),
};

DRIVER_MODULE(nvme, root, nvme_nvmf_driver, nvmf_modevent, NULL);
MODULE_DEPEND(nvmf, nvmf_transport, 1, 1, 1);