1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2023-2024 Chelsio Communications, Inc. 5 * Written by: John Baldwin <jhb@FreeBSD.org> 6 */ 7 8 #include <sys/param.h> 9 #include <sys/bus.h> 10 #include <sys/conf.h> 11 #include <sys/dnv.h> 12 #include <sys/eventhandler.h> 13 #include <sys/lock.h> 14 #include <sys/kernel.h> 15 #include <sys/malloc.h> 16 #include <sys/memdesc.h> 17 #include <sys/module.h> 18 #include <sys/mutex.h> 19 #include <sys/nv.h> 20 #include <sys/reboot.h> 21 #include <sys/sx.h> 22 #include <sys/sysctl.h> 23 #include <sys/taskqueue.h> 24 #include <dev/nvme/nvme.h> 25 #include <dev/nvmf/nvmf.h> 26 #include <dev/nvmf/nvmf_transport.h> 27 #include <dev/nvmf/host/nvmf_var.h> 28 29 static struct cdevsw nvmf_cdevsw; 30 31 bool nvmf_fail_disconnect = false; 32 SYSCTL_BOOL(_kern_nvmf, OID_AUTO, fail_on_disconnection, CTLFLAG_RWTUN, 33 &nvmf_fail_disconnect, 0, "Fail I/O requests on connection failure"); 34 35 MALLOC_DEFINE(M_NVMF, "nvmf", "NVMe over Fabrics host"); 36 37 static void nvmf_disconnect_task(void *arg, int pending); 38 static void nvmf_shutdown_pre_sync(void *arg, int howto); 39 static void nvmf_shutdown_post_sync(void *arg, int howto); 40 41 void 42 nvmf_complete(void *arg, const struct nvme_completion *cqe) 43 { 44 struct nvmf_completion_status *status = arg; 45 struct mtx *mtx; 46 47 status->cqe = *cqe; 48 mtx = mtx_pool_find(mtxpool_sleep, status); 49 mtx_lock(mtx); 50 status->done = true; 51 mtx_unlock(mtx); 52 wakeup(status); 53 } 54 55 void 56 nvmf_io_complete(void *arg, size_t xfered, int error) 57 { 58 struct nvmf_completion_status *status = arg; 59 struct mtx *mtx; 60 61 status->io_error = error; 62 mtx = mtx_pool_find(mtxpool_sleep, status); 63 mtx_lock(mtx); 64 status->io_done = true; 65 mtx_unlock(mtx); 66 wakeup(status); 67 } 68 69 void 70 nvmf_wait_for_reply(struct nvmf_completion_status *status) 71 { 72 struct mtx *mtx; 73 74 mtx = mtx_pool_find(mtxpool_sleep, status); 75 mtx_lock(mtx); 76 while (!status->done || !status->io_done) 77 mtx_sleep(status, mtx, 0, "nvmfcmd", 0); 78 mtx_unlock(mtx); 79 } 80 81 static int 82 nvmf_read_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size, 83 uint64_t *value) 84 { 85 const struct nvmf_fabric_prop_get_rsp *rsp; 86 struct nvmf_completion_status status; 87 88 nvmf_status_init(&status); 89 if (!nvmf_cmd_get_property(sc, offset, size, nvmf_complete, &status, 90 M_WAITOK)) 91 return (ECONNABORTED); 92 nvmf_wait_for_reply(&status); 93 94 if (status.cqe.status != 0) { 95 device_printf(sc->dev, "PROPERTY_GET failed, status %#x\n", 96 le16toh(status.cqe.status)); 97 return (EIO); 98 } 99 100 rsp = (const struct nvmf_fabric_prop_get_rsp *)&status.cqe; 101 if (size == 8) 102 *value = le64toh(rsp->value.u64); 103 else 104 *value = le32toh(rsp->value.u32.low); 105 return (0); 106 } 107 108 static int 109 nvmf_write_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size, 110 uint64_t value) 111 { 112 struct nvmf_completion_status status; 113 114 nvmf_status_init(&status); 115 if (!nvmf_cmd_set_property(sc, offset, size, value, nvmf_complete, &status, 116 M_WAITOK)) 117 return (ECONNABORTED); 118 nvmf_wait_for_reply(&status); 119 120 if (status.cqe.status != 0) { 121 device_printf(sc->dev, "PROPERTY_SET failed, status %#x\n", 122 le16toh(status.cqe.status)); 123 return (EIO); 124 } 125 return (0); 126 } 127 128 static void 129 nvmf_shutdown_controller(struct nvmf_softc *sc) 130 { 131 uint64_t cc; 132 int error; 133 134 error = nvmf_read_property(sc, NVMF_PROP_CC, 4, &cc); 135 if (error != 0) { 136 device_printf(sc->dev, "Failed to fetch CC for shutdown\n"); 137 return; 138 } 139 140 cc |= NVMEF(NVME_CC_REG_SHN, NVME_SHN_NORMAL); 141 142 error = nvmf_write_property(sc, NVMF_PROP_CC, 4, cc); 143 if (error != 0) 144 device_printf(sc->dev, 145 "Failed to set CC to trigger shutdown\n"); 146 } 147 148 static void 149 nvmf_check_keep_alive(void *arg) 150 { 151 struct nvmf_softc *sc = arg; 152 int traffic; 153 154 traffic = atomic_readandclear_int(&sc->ka_active_rx_traffic); 155 if (traffic == 0) { 156 device_printf(sc->dev, 157 "disconnecting due to KeepAlive timeout\n"); 158 nvmf_disconnect(sc); 159 return; 160 } 161 162 callout_schedule_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0, C_HARDCLOCK); 163 } 164 165 static void 166 nvmf_keep_alive_complete(void *arg, const struct nvme_completion *cqe) 167 { 168 struct nvmf_softc *sc = arg; 169 170 atomic_store_int(&sc->ka_active_rx_traffic, 1); 171 if (cqe->status != 0) { 172 device_printf(sc->dev, 173 "KeepAlive response reported status %#x\n", 174 le16toh(cqe->status)); 175 } 176 } 177 178 static void 179 nvmf_send_keep_alive(void *arg) 180 { 181 struct nvmf_softc *sc = arg; 182 int traffic; 183 184 /* 185 * Don't bother sending a KeepAlive command if TKAS is active 186 * and another command has been sent during the interval. 187 */ 188 traffic = atomic_load_int(&sc->ka_active_tx_traffic); 189 if (traffic == 0 && !nvmf_cmd_keep_alive(sc, nvmf_keep_alive_complete, 190 sc, M_NOWAIT)) 191 device_printf(sc->dev, 192 "Failed to allocate KeepAlive command\n"); 193 194 /* Clear ka_active_tx_traffic after sending the keep alive command. */ 195 atomic_store_int(&sc->ka_active_tx_traffic, 0); 196 197 callout_schedule_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, C_HARDCLOCK); 198 } 199 200 int 201 nvmf_copyin_handoff(const struct nvmf_ioc_nv *nv, nvlist_t **nvlp) 202 { 203 const nvlist_t *const *io; 204 const nvlist_t *admin; 205 nvlist_t *nvl; 206 size_t i, num_io_queues; 207 uint32_t qsize; 208 int error; 209 210 error = nvmf_unpack_ioc_nvlist(nv, &nvl); 211 if (error != 0) 212 return (error); 213 214 if (!nvlist_exists_number(nvl, "trtype") || 215 !nvlist_exists_nvlist(nvl, "admin") || 216 !nvlist_exists_nvlist_array(nvl, "io") || 217 !nvlist_exists_binary(nvl, "cdata")) 218 goto invalid; 219 220 admin = nvlist_get_nvlist(nvl, "admin"); 221 if (!nvmf_validate_qpair_nvlist(admin, false)) 222 goto invalid; 223 if (!nvlist_get_bool(admin, "admin")) 224 goto invalid; 225 226 io = nvlist_get_nvlist_array(nvl, "io", &num_io_queues); 227 if (num_io_queues < 1) 228 goto invalid; 229 for (i = 0; i < num_io_queues; i++) { 230 if (!nvmf_validate_qpair_nvlist(io[i], false)) 231 goto invalid; 232 } 233 234 /* Require all I/O queues to be the same size. */ 235 qsize = nvlist_get_number(io[0], "qsize"); 236 for (i = 1; i < num_io_queues; i++) { 237 if (nvlist_get_number(io[i], "qsize") != qsize) 238 goto invalid; 239 } 240 241 nvlist_get_binary(nvl, "cdata", &i); 242 if (i != sizeof(struct nvme_controller_data)) 243 goto invalid; 244 245 *nvlp = nvl; 246 return (0); 247 invalid: 248 nvlist_destroy(nvl); 249 return (EINVAL); 250 } 251 252 static int 253 nvmf_probe(device_t dev) 254 { 255 const nvlist_t *nvl = device_get_ivars(dev); 256 const struct nvme_controller_data *cdata; 257 258 if (nvl == NULL) 259 return (ENXIO); 260 261 cdata = nvlist_get_binary(nvl, "cdata", NULL); 262 device_set_descf(dev, "Fabrics: %.256s", cdata->subnqn); 263 return (BUS_PROBE_DEFAULT); 264 } 265 266 static int 267 nvmf_establish_connection(struct nvmf_softc *sc, const nvlist_t *nvl) 268 { 269 const nvlist_t *const *io; 270 const nvlist_t *admin; 271 uint64_t kato; 272 size_t num_io_queues; 273 enum nvmf_trtype trtype; 274 char name[16]; 275 276 trtype = nvlist_get_number(nvl, "trtype"); 277 admin = nvlist_get_nvlist(nvl, "admin"); 278 io = nvlist_get_nvlist_array(nvl, "io", &num_io_queues); 279 kato = dnvlist_get_number(nvl, "kato", 0); 280 281 /* Setup the admin queue. */ 282 sc->admin = nvmf_init_qp(sc, trtype, admin, "admin queue", 0); 283 if (sc->admin == NULL) { 284 device_printf(sc->dev, "Failed to setup admin queue\n"); 285 return (ENXIO); 286 } 287 288 /* Setup I/O queues. */ 289 sc->io = malloc(num_io_queues * sizeof(*sc->io), M_NVMF, 290 M_WAITOK | M_ZERO); 291 sc->num_io_queues = num_io_queues; 292 for (u_int i = 0; i < sc->num_io_queues; i++) { 293 snprintf(name, sizeof(name), "I/O queue %u", i); 294 sc->io[i] = nvmf_init_qp(sc, trtype, io[i], name, i); 295 if (sc->io[i] == NULL) { 296 device_printf(sc->dev, "Failed to setup I/O queue %u\n", 297 i + 1); 298 return (ENXIO); 299 } 300 } 301 302 /* Start KeepAlive timers. */ 303 if (kato != 0) { 304 sc->ka_traffic = NVMEV(NVME_CTRLR_DATA_CTRATT_TBKAS, 305 sc->cdata->ctratt) != 0; 306 sc->ka_rx_sbt = mstosbt(kato); 307 sc->ka_tx_sbt = sc->ka_rx_sbt / 2; 308 callout_reset_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0, 309 nvmf_check_keep_alive, sc, C_HARDCLOCK); 310 callout_reset_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, 311 nvmf_send_keep_alive, sc, C_HARDCLOCK); 312 } 313 314 memcpy(sc->cdata, nvlist_get_binary(nvl, "cdata", NULL), 315 sizeof(*sc->cdata)); 316 317 return (0); 318 } 319 320 typedef bool nvmf_scan_active_ns_cb(struct nvmf_softc *, uint32_t, 321 const struct nvme_namespace_data *, void *); 322 323 static bool 324 nvmf_scan_active_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist, 325 struct nvme_namespace_data *data, uint32_t *nsidp, 326 nvmf_scan_active_ns_cb *cb, void *cb_arg) 327 { 328 struct nvmf_completion_status status; 329 uint32_t nsid; 330 331 nvmf_status_init(&status); 332 nvmf_status_wait_io(&status); 333 if (!nvmf_cmd_identify_active_namespaces(sc, *nsidp, nslist, 334 nvmf_complete, &status, nvmf_io_complete, &status, M_WAITOK)) { 335 device_printf(sc->dev, 336 "failed to send IDENTIFY active namespaces command\n"); 337 return (false); 338 } 339 nvmf_wait_for_reply(&status); 340 341 if (status.cqe.status != 0) { 342 device_printf(sc->dev, 343 "IDENTIFY active namespaces failed, status %#x\n", 344 le16toh(status.cqe.status)); 345 return (false); 346 } 347 348 if (status.io_error != 0) { 349 device_printf(sc->dev, 350 "IDENTIFY active namespaces failed with I/O error %d\n", 351 status.io_error); 352 return (false); 353 } 354 355 for (u_int i = 0; i < nitems(nslist->ns); i++) { 356 nsid = nslist->ns[i]; 357 if (nsid == 0) { 358 *nsidp = 0; 359 return (true); 360 } 361 362 nvmf_status_init(&status); 363 nvmf_status_wait_io(&status); 364 if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete, 365 &status, nvmf_io_complete, &status, M_WAITOK)) { 366 device_printf(sc->dev, 367 "failed to send IDENTIFY namespace %u command\n", 368 nsid); 369 return (false); 370 } 371 nvmf_wait_for_reply(&status); 372 373 if (status.cqe.status != 0) { 374 device_printf(sc->dev, 375 "IDENTIFY namespace %u failed, status %#x\n", nsid, 376 le16toh(status.cqe.status)); 377 return (false); 378 } 379 380 if (status.io_error != 0) { 381 device_printf(sc->dev, 382 "IDENTIFY namespace %u failed with I/O error %d\n", 383 nsid, status.io_error); 384 return (false); 385 } 386 387 nvme_namespace_data_swapbytes(data); 388 if (!cb(sc, nsid, data, cb_arg)) 389 return (false); 390 } 391 392 MPASS(nsid == nslist->ns[nitems(nslist->ns) - 1] && nsid != 0); 393 394 if (nsid >= NVME_GLOBAL_NAMESPACE_TAG - 1) 395 *nsidp = 0; 396 else 397 *nsidp = nsid; 398 return (true); 399 } 400 401 static bool 402 nvmf_scan_active_namespaces(struct nvmf_softc *sc, nvmf_scan_active_ns_cb *cb, 403 void *cb_arg) 404 { 405 struct nvme_namespace_data *data; 406 struct nvme_ns_list *nslist; 407 uint32_t nsid; 408 bool retval; 409 410 nslist = malloc(sizeof(*nslist), M_NVMF, M_WAITOK); 411 data = malloc(sizeof(*data), M_NVMF, M_WAITOK); 412 413 nsid = 0; 414 retval = true; 415 for (;;) { 416 if (!nvmf_scan_active_nslist(sc, nslist, data, &nsid, cb, 417 cb_arg)) { 418 retval = false; 419 break; 420 } 421 if (nsid == 0) 422 break; 423 } 424 425 free(data, M_NVMF); 426 free(nslist, M_NVMF); 427 return (retval); 428 } 429 430 static bool 431 nvmf_add_ns(struct nvmf_softc *sc, uint32_t nsid, 432 const struct nvme_namespace_data *data, void *arg __unused) 433 { 434 if (sc->ns[nsid - 1] != NULL) { 435 device_printf(sc->dev, 436 "duplicate namespace %u in active namespace list\n", 437 nsid); 438 return (false); 439 } 440 441 /* 442 * As in nvme_ns_construct, a size of zero indicates an 443 * invalid namespace. 444 */ 445 if (data->nsze == 0) { 446 device_printf(sc->dev, 447 "ignoring active namespace %u with zero size\n", nsid); 448 return (true); 449 } 450 451 sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data); 452 453 nvmf_sim_rescan_ns(sc, nsid); 454 return (true); 455 } 456 457 static bool 458 nvmf_add_namespaces(struct nvmf_softc *sc) 459 { 460 sc->ns = mallocarray(sc->cdata->nn, sizeof(*sc->ns), M_NVMF, 461 M_WAITOK | M_ZERO); 462 return (nvmf_scan_active_namespaces(sc, nvmf_add_ns, NULL)); 463 } 464 465 static int 466 nvmf_attach(device_t dev) 467 { 468 struct make_dev_args mda; 469 struct nvmf_softc *sc = device_get_softc(dev); 470 const nvlist_t *nvl = device_get_ivars(dev); 471 const nvlist_t * const *io; 472 struct sysctl_oid *oid; 473 uint64_t val; 474 u_int i; 475 int error; 476 477 if (nvl == NULL) 478 return (ENXIO); 479 480 sc->dev = dev; 481 sc->trtype = nvlist_get_number(nvl, "trtype"); 482 callout_init(&sc->ka_rx_timer, 1); 483 callout_init(&sc->ka_tx_timer, 1); 484 sx_init(&sc->connection_lock, "nvmf connection"); 485 TASK_INIT(&sc->disconnect_task, 0, nvmf_disconnect_task, sc); 486 487 oid = SYSCTL_ADD_NODE(device_get_sysctl_ctx(dev), 488 SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "ioq", 489 CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "I/O Queues"); 490 sc->ioq_oid_list = SYSCTL_CHILDREN(oid); 491 492 sc->cdata = malloc(sizeof(*sc->cdata), M_NVMF, M_WAITOK); 493 494 nvmf_init_aer(sc); 495 496 error = nvmf_establish_connection(sc, nvl); 497 if (error != 0) 498 goto out; 499 500 error = nvmf_read_property(sc, NVMF_PROP_CAP, 8, &sc->cap); 501 if (error != 0) { 502 device_printf(sc->dev, "Failed to fetch CAP\n"); 503 error = ENXIO; 504 goto out; 505 } 506 507 error = nvmf_read_property(sc, NVMF_PROP_VS, 4, &val); 508 if (error != 0) { 509 device_printf(sc->dev, "Failed to fetch VS\n"); 510 error = ENXIO; 511 goto out; 512 } 513 sc->vs = val; 514 515 /* Honor MDTS if it is set. */ 516 sc->max_xfer_size = maxphys; 517 if (sc->cdata->mdts != 0) { 518 sc->max_xfer_size = ulmin(sc->max_xfer_size, 519 1 << (sc->cdata->mdts + NVME_MPS_SHIFT + 520 NVME_CAP_HI_MPSMIN(sc->cap >> 32))); 521 } 522 523 io = nvlist_get_nvlist_array(nvl, "io", NULL); 524 sc->max_pending_io = nvlist_get_number(io[0], "qsize") * 525 sc->num_io_queues; 526 527 error = nvmf_init_sim(sc); 528 if (error != 0) 529 goto out; 530 531 error = nvmf_start_aer(sc); 532 if (error != 0) { 533 nvmf_destroy_sim(sc); 534 goto out; 535 } 536 537 if (!nvmf_add_namespaces(sc)) { 538 nvmf_destroy_sim(sc); 539 goto out; 540 } 541 542 make_dev_args_init(&mda); 543 mda.mda_devsw = &nvmf_cdevsw; 544 mda.mda_uid = UID_ROOT; 545 mda.mda_gid = GID_WHEEL; 546 mda.mda_mode = 0600; 547 mda.mda_si_drv1 = sc; 548 error = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev)); 549 if (error != 0) { 550 nvmf_destroy_sim(sc); 551 goto out; 552 } 553 554 sc->shutdown_pre_sync_eh = EVENTHANDLER_REGISTER(shutdown_pre_sync, 555 nvmf_shutdown_pre_sync, sc, SHUTDOWN_PRI_FIRST); 556 sc->shutdown_post_sync_eh = EVENTHANDLER_REGISTER(shutdown_post_sync, 557 nvmf_shutdown_post_sync, sc, SHUTDOWN_PRI_LAST); 558 559 return (0); 560 out: 561 if (sc->ns != NULL) { 562 for (i = 0; i < sc->cdata->nn; i++) { 563 if (sc->ns[i] != NULL) 564 nvmf_destroy_ns(sc->ns[i]); 565 } 566 free(sc->ns, M_NVMF); 567 } 568 569 callout_drain(&sc->ka_tx_timer); 570 callout_drain(&sc->ka_rx_timer); 571 572 if (sc->admin != NULL) 573 nvmf_shutdown_controller(sc); 574 575 for (i = 0; i < sc->num_io_queues; i++) { 576 if (sc->io[i] != NULL) 577 nvmf_destroy_qp(sc->io[i]); 578 } 579 free(sc->io, M_NVMF); 580 if (sc->admin != NULL) 581 nvmf_destroy_qp(sc->admin); 582 583 nvmf_destroy_aer(sc); 584 585 taskqueue_drain(taskqueue_thread, &sc->disconnect_task); 586 sx_destroy(&sc->connection_lock); 587 free(sc->cdata, M_NVMF); 588 return (error); 589 } 590 591 void 592 nvmf_disconnect(struct nvmf_softc *sc) 593 { 594 taskqueue_enqueue(taskqueue_thread, &sc->disconnect_task); 595 } 596 597 static void 598 nvmf_disconnect_task(void *arg, int pending __unused) 599 { 600 struct nvmf_softc *sc = arg; 601 u_int i; 602 603 sx_xlock(&sc->connection_lock); 604 if (sc->admin == NULL) { 605 /* 606 * Ignore transport errors if there is no active 607 * association. 608 */ 609 sx_xunlock(&sc->connection_lock); 610 return; 611 } 612 613 if (sc->detaching) { 614 if (sc->admin != NULL) { 615 /* 616 * This unsticks the detach process if a 617 * transport error occurs during detach. 618 */ 619 nvmf_shutdown_qp(sc->admin); 620 } 621 sx_xunlock(&sc->connection_lock); 622 return; 623 } 624 625 if (sc->cdev == NULL) { 626 /* 627 * Transport error occurred during attach (nvmf_add_namespaces). 628 * Shutdown the admin queue. 629 */ 630 nvmf_shutdown_qp(sc->admin); 631 sx_xunlock(&sc->connection_lock); 632 return; 633 } 634 635 callout_drain(&sc->ka_tx_timer); 636 callout_drain(&sc->ka_rx_timer); 637 sc->ka_traffic = false; 638 639 /* Quiesce namespace consumers. */ 640 nvmf_disconnect_sim(sc); 641 for (i = 0; i < sc->cdata->nn; i++) { 642 if (sc->ns[i] != NULL) 643 nvmf_disconnect_ns(sc->ns[i]); 644 } 645 646 /* Shutdown the existing qpairs. */ 647 for (i = 0; i < sc->num_io_queues; i++) { 648 nvmf_destroy_qp(sc->io[i]); 649 } 650 free(sc->io, M_NVMF); 651 sc->io = NULL; 652 sc->num_io_queues = 0; 653 nvmf_destroy_qp(sc->admin); 654 sc->admin = NULL; 655 656 sx_xunlock(&sc->connection_lock); 657 } 658 659 static int 660 nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv) 661 { 662 const struct nvme_controller_data *cdata; 663 nvlist_t *nvl; 664 u_int i; 665 int error; 666 667 error = nvmf_copyin_handoff(nv, &nvl); 668 if (error != 0) 669 return (error); 670 671 /* XXX: Should we permit changing the transport type? */ 672 if (sc->trtype != nvlist_get_number(nvl, "trtype")) { 673 device_printf(sc->dev, 674 "transport type mismatch on reconnect\n"); 675 return (EINVAL); 676 } 677 678 sx_xlock(&sc->connection_lock); 679 if (sc->admin != NULL || sc->detaching) { 680 error = EBUSY; 681 goto out; 682 } 683 684 /* 685 * Ensure this is for the same controller. Note that the 686 * controller ID can vary across associations if the remote 687 * system is using the dynamic controller model. This merely 688 * ensures the new association is connected to the same NVMe 689 * subsystem. 690 */ 691 cdata = nvlist_get_binary(nvl, "cdata", NULL); 692 if (memcmp(sc->cdata->subnqn, cdata->subnqn, 693 sizeof(cdata->subnqn)) != 0) { 694 device_printf(sc->dev, 695 "controller subsystem NQN mismatch on reconnect\n"); 696 error = EINVAL; 697 goto out; 698 } 699 700 /* 701 * XXX: Require same number and size of I/O queues so that 702 * max_pending_io is still correct? 703 */ 704 705 error = nvmf_establish_connection(sc, nvl); 706 if (error != 0) 707 goto out; 708 709 error = nvmf_start_aer(sc); 710 if (error != 0) 711 goto out; 712 713 device_printf(sc->dev, 714 "established new association with %u I/O queues\n", 715 sc->num_io_queues); 716 717 /* Restart namespace consumers. */ 718 for (i = 0; i < sc->cdata->nn; i++) { 719 if (sc->ns[i] != NULL) 720 nvmf_reconnect_ns(sc->ns[i]); 721 } 722 nvmf_reconnect_sim(sc); 723 724 nvmf_rescan_all_ns(sc); 725 out: 726 sx_xunlock(&sc->connection_lock); 727 nvlist_destroy(nvl); 728 return (error); 729 } 730 731 static void 732 nvmf_shutdown_pre_sync(void *arg, int howto) 733 { 734 struct nvmf_softc *sc = arg; 735 736 if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED()) 737 return; 738 739 /* 740 * If this association is disconnected, abort any pending 741 * requests with an error to permit filesystems to unmount 742 * without hanging. 743 */ 744 sx_xlock(&sc->connection_lock); 745 if (sc->admin != NULL || sc->detaching) { 746 sx_xunlock(&sc->connection_lock); 747 return; 748 } 749 750 for (u_int i = 0; i < sc->cdata->nn; i++) { 751 if (sc->ns[i] != NULL) 752 nvmf_shutdown_ns(sc->ns[i]); 753 } 754 nvmf_shutdown_sim(sc); 755 sx_xunlock(&sc->connection_lock); 756 } 757 758 static void 759 nvmf_shutdown_post_sync(void *arg, int howto) 760 { 761 struct nvmf_softc *sc = arg; 762 763 if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED()) 764 return; 765 766 /* 767 * If this association is connected, disconnect gracefully. 768 */ 769 sx_xlock(&sc->connection_lock); 770 if (sc->admin == NULL || sc->detaching) { 771 sx_xunlock(&sc->connection_lock); 772 return; 773 } 774 775 callout_drain(&sc->ka_tx_timer); 776 callout_drain(&sc->ka_rx_timer); 777 778 nvmf_shutdown_controller(sc); 779 780 /* 781 * Quiesce consumers so that any commands submitted after this 782 * fail with an error. Notably, nda(4) calls nda_flush() from 783 * a post_sync handler that might be ordered after this one. 784 */ 785 for (u_int i = 0; i < sc->cdata->nn; i++) { 786 if (sc->ns[i] != NULL) 787 nvmf_shutdown_ns(sc->ns[i]); 788 } 789 nvmf_shutdown_sim(sc); 790 791 for (u_int i = 0; i < sc->num_io_queues; i++) { 792 nvmf_destroy_qp(sc->io[i]); 793 } 794 nvmf_destroy_qp(sc->admin); 795 sc->admin = NULL; 796 sx_xunlock(&sc->connection_lock); 797 } 798 799 static int 800 nvmf_detach(device_t dev) 801 { 802 struct nvmf_softc *sc = device_get_softc(dev); 803 u_int i; 804 805 destroy_dev(sc->cdev); 806 807 sx_xlock(&sc->connection_lock); 808 sc->detaching = true; 809 sx_xunlock(&sc->connection_lock); 810 811 EVENTHANDLER_DEREGISTER(shutdown_pre_sync, sc->shutdown_pre_sync_eh); 812 EVENTHANDLER_DEREGISTER(shutdown_post_sync, sc->shutdown_post_sync_eh); 813 814 nvmf_destroy_sim(sc); 815 for (i = 0; i < sc->cdata->nn; i++) { 816 if (sc->ns[i] != NULL) 817 nvmf_destroy_ns(sc->ns[i]); 818 } 819 free(sc->ns, M_NVMF); 820 821 callout_drain(&sc->ka_tx_timer); 822 callout_drain(&sc->ka_rx_timer); 823 824 if (sc->admin != NULL) 825 nvmf_shutdown_controller(sc); 826 827 for (i = 0; i < sc->num_io_queues; i++) { 828 nvmf_destroy_qp(sc->io[i]); 829 } 830 free(sc->io, M_NVMF); 831 832 taskqueue_drain(taskqueue_thread, &sc->disconnect_task); 833 834 if (sc->admin != NULL) 835 nvmf_destroy_qp(sc->admin); 836 837 nvmf_destroy_aer(sc); 838 839 sx_destroy(&sc->connection_lock); 840 free(sc->cdata, M_NVMF); 841 return (0); 842 } 843 844 static void 845 nvmf_rescan_ns_1(struct nvmf_softc *sc, uint32_t nsid, 846 const struct nvme_namespace_data *data) 847 { 848 struct nvmf_namespace *ns; 849 850 /* XXX: Needs locking around sc->ns[]. */ 851 ns = sc->ns[nsid - 1]; 852 if (data->nsze == 0) { 853 /* XXX: Needs locking */ 854 if (ns != NULL) { 855 nvmf_destroy_ns(ns); 856 sc->ns[nsid - 1] = NULL; 857 } 858 } else { 859 /* XXX: Needs locking */ 860 if (ns == NULL) { 861 sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data); 862 } else { 863 if (!nvmf_update_ns(ns, data)) { 864 nvmf_destroy_ns(ns); 865 sc->ns[nsid - 1] = NULL; 866 } 867 } 868 } 869 870 nvmf_sim_rescan_ns(sc, nsid); 871 } 872 873 void 874 nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid) 875 { 876 struct nvmf_completion_status status; 877 struct nvme_namespace_data *data; 878 879 data = malloc(sizeof(*data), M_NVMF, M_WAITOK); 880 881 nvmf_status_init(&status); 882 nvmf_status_wait_io(&status); 883 if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete, 884 &status, nvmf_io_complete, &status, M_WAITOK)) { 885 device_printf(sc->dev, 886 "failed to send IDENTIFY namespace %u command\n", nsid); 887 free(data, M_NVMF); 888 return; 889 } 890 nvmf_wait_for_reply(&status); 891 892 if (status.cqe.status != 0) { 893 device_printf(sc->dev, 894 "IDENTIFY namespace %u failed, status %#x\n", nsid, 895 le16toh(status.cqe.status)); 896 free(data, M_NVMF); 897 return; 898 } 899 900 if (status.io_error != 0) { 901 device_printf(sc->dev, 902 "IDENTIFY namespace %u failed with I/O error %d\n", 903 nsid, status.io_error); 904 free(data, M_NVMF); 905 return; 906 } 907 908 nvme_namespace_data_swapbytes(data); 909 910 nvmf_rescan_ns_1(sc, nsid, data); 911 912 free(data, M_NVMF); 913 } 914 915 static void 916 nvmf_purge_namespaces(struct nvmf_softc *sc, uint32_t first_nsid, 917 uint32_t next_valid_nsid) 918 { 919 struct nvmf_namespace *ns; 920 921 for (uint32_t nsid = first_nsid; nsid < next_valid_nsid; nsid++) 922 { 923 /* XXX: Needs locking around sc->ns[]. */ 924 ns = sc->ns[nsid - 1]; 925 if (ns != NULL) { 926 nvmf_destroy_ns(ns); 927 sc->ns[nsid - 1] = NULL; 928 929 nvmf_sim_rescan_ns(sc, nsid); 930 } 931 } 932 } 933 934 static bool 935 nvmf_rescan_ns_cb(struct nvmf_softc *sc, uint32_t nsid, 936 const struct nvme_namespace_data *data, void *arg) 937 { 938 uint32_t *last_nsid = arg; 939 940 /* Check for any gaps prior to this namespace. */ 941 nvmf_purge_namespaces(sc, *last_nsid + 1, nsid); 942 *last_nsid = nsid; 943 944 nvmf_rescan_ns_1(sc, nsid, data); 945 return (true); 946 } 947 948 void 949 nvmf_rescan_all_ns(struct nvmf_softc *sc) 950 { 951 uint32_t last_nsid; 952 953 last_nsid = 0; 954 if (!nvmf_scan_active_namespaces(sc, nvmf_rescan_ns_cb, &last_nsid)) 955 return; 956 957 /* 958 * Check for any namespace devices after the last active 959 * namespace. 960 */ 961 nvmf_purge_namespaces(sc, last_nsid + 1, sc->cdata->nn + 1); 962 } 963 964 int 965 nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt, 966 bool admin) 967 { 968 struct nvmf_completion_status status; 969 struct nvme_command cmd; 970 struct memdesc mem; 971 struct nvmf_host_qpair *qp; 972 struct nvmf_request *req; 973 void *buf; 974 int error; 975 976 if (pt->len > sc->max_xfer_size) 977 return (EINVAL); 978 979 buf = NULL; 980 if (pt->len != 0) { 981 /* 982 * XXX: Depending on the size we may want to pin the 983 * user pages and use a memdesc with vm_page_t's 984 * instead. 985 */ 986 buf = malloc(pt->len, M_NVMF, M_WAITOK); 987 if (pt->is_read == 0) { 988 error = copyin(pt->buf, buf, pt->len); 989 if (error != 0) { 990 free(buf, M_NVMF); 991 return (error); 992 } 993 } else { 994 /* Ensure no kernel data is leaked to userland. */ 995 memset(buf, 0, pt->len); 996 } 997 } 998 999 memset(&cmd, 0, sizeof(cmd)); 1000 cmd.opc = pt->cmd.opc; 1001 cmd.fuse = pt->cmd.fuse; 1002 cmd.nsid = pt->cmd.nsid; 1003 cmd.cdw10 = pt->cmd.cdw10; 1004 cmd.cdw11 = pt->cmd.cdw11; 1005 cmd.cdw12 = pt->cmd.cdw12; 1006 cmd.cdw13 = pt->cmd.cdw13; 1007 cmd.cdw14 = pt->cmd.cdw14; 1008 cmd.cdw15 = pt->cmd.cdw15; 1009 1010 sx_slock(&sc->connection_lock); 1011 if (sc->admin == NULL || sc->detaching) { 1012 device_printf(sc->dev, 1013 "failed to send passthrough command\n"); 1014 error = ECONNABORTED; 1015 sx_sunlock(&sc->connection_lock); 1016 goto error; 1017 } 1018 if (admin) 1019 qp = sc->admin; 1020 else 1021 qp = nvmf_select_io_queue(sc); 1022 nvmf_status_init(&status); 1023 req = nvmf_allocate_request(qp, &cmd, nvmf_complete, &status, M_WAITOK); 1024 sx_sunlock(&sc->connection_lock); 1025 if (req == NULL) { 1026 device_printf(sc->dev, "failed to send passthrough command\n"); 1027 error = ECONNABORTED; 1028 goto error; 1029 } 1030 1031 if (pt->len != 0) { 1032 mem = memdesc_vaddr(buf, pt->len); 1033 nvmf_capsule_append_data(req->nc, &mem, pt->len, 1034 pt->is_read == 0, nvmf_io_complete, &status); 1035 nvmf_status_wait_io(&status); 1036 } 1037 1038 nvmf_submit_request(req); 1039 nvmf_wait_for_reply(&status); 1040 1041 memset(&pt->cpl, 0, sizeof(pt->cpl)); 1042 pt->cpl.cdw0 = status.cqe.cdw0; 1043 pt->cpl.status = status.cqe.status; 1044 1045 error = status.io_error; 1046 if (error == 0 && pt->len != 0 && pt->is_read != 0) 1047 error = copyout(buf, pt->buf, pt->len); 1048 error: 1049 free(buf, M_NVMF); 1050 return (error); 1051 } 1052 1053 static int 1054 nvmf_reconnect_params(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv) 1055 { 1056 nvlist_t *nvl; 1057 int error; 1058 1059 nvl = nvlist_create(0); 1060 1061 sx_slock(&sc->connection_lock); 1062 if ((sc->cdata->fcatt & 1) == 0) 1063 nvlist_add_number(nvl, "cntlid", NVMF_CNTLID_DYNAMIC); 1064 else 1065 nvlist_add_number(nvl, "cntlid", sc->cdata->ctrlr_id); 1066 nvlist_add_stringf(nvl, "subnqn", "%.256s", sc->cdata->subnqn); 1067 sx_sunlock(&sc->connection_lock); 1068 1069 error = nvmf_pack_ioc_nvlist(nvl, nv); 1070 nvlist_destroy(nvl); 1071 return (error); 1072 } 1073 1074 static int 1075 nvmf_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag, 1076 struct thread *td) 1077 { 1078 struct nvmf_softc *sc = cdev->si_drv1; 1079 struct nvme_get_nsid *gnsid; 1080 struct nvme_pt_command *pt; 1081 struct nvmf_ioc_nv *nv; 1082 1083 switch (cmd) { 1084 case NVME_PASSTHROUGH_CMD: 1085 pt = (struct nvme_pt_command *)arg; 1086 return (nvmf_passthrough_cmd(sc, pt, true)); 1087 case NVME_GET_NSID: 1088 gnsid = (struct nvme_get_nsid *)arg; 1089 strlcpy(gnsid->cdev, device_get_nameunit(sc->dev), 1090 sizeof(gnsid->cdev)); 1091 gnsid->nsid = 0; 1092 return (0); 1093 case NVME_GET_MAX_XFER_SIZE: 1094 *(uint64_t *)arg = sc->max_xfer_size; 1095 return (0); 1096 case NVMF_RECONNECT_PARAMS: 1097 nv = (struct nvmf_ioc_nv *)arg; 1098 return (nvmf_reconnect_params(sc, nv)); 1099 case NVMF_RECONNECT_HOST: 1100 nv = (struct nvmf_ioc_nv *)arg; 1101 return (nvmf_reconnect_host(sc, nv)); 1102 default: 1103 return (ENOTTY); 1104 } 1105 } 1106 1107 static struct cdevsw nvmf_cdevsw = { 1108 .d_version = D_VERSION, 1109 .d_ioctl = nvmf_ioctl 1110 }; 1111 1112 static int 1113 nvmf_modevent(module_t mod, int what, void *arg) 1114 { 1115 switch (what) { 1116 case MOD_LOAD: 1117 return (nvmf_ctl_load()); 1118 case MOD_QUIESCE: 1119 return (0); 1120 case MOD_UNLOAD: 1121 nvmf_ctl_unload(); 1122 destroy_dev_drain(&nvmf_cdevsw); 1123 return (0); 1124 default: 1125 return (EOPNOTSUPP); 1126 } 1127 } 1128 1129 static device_method_t nvmf_methods[] = { 1130 /* Device interface */ 1131 DEVMETHOD(device_probe, nvmf_probe), 1132 DEVMETHOD(device_attach, nvmf_attach), 1133 DEVMETHOD(device_detach, nvmf_detach), 1134 DEVMETHOD_END 1135 }; 1136 1137 driver_t nvme_nvmf_driver = { 1138 "nvme", 1139 nvmf_methods, 1140 sizeof(struct nvmf_softc), 1141 }; 1142 1143 DRIVER_MODULE(nvme, root, nvme_nvmf_driver, nvmf_modevent, NULL); 1144 MODULE_DEPEND(nvmf, nvmf_transport, 1, 1, 1); 1145