/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2023-2024 Chelsio Communications, Inc.
 * Written by: John Baldwin <jhb@FreeBSD.org>
 */

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/dnv.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/memdesc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/nv.h>
#include <sys/reboot.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/nvmf.h>
#include <dev/nvmf/nvmf_transport.h>
#include <dev/nvmf/host/nvmf_var.h>

static struct cdevsw nvmf_cdevsw;
static struct taskqueue *nvmf_tq;

bool nvmf_fail_disconnect = false;
SYSCTL_BOOL(_kern_nvmf, OID_AUTO, fail_on_disconnection, CTLFLAG_RWTUN,
    &nvmf_fail_disconnect, 0, "Fail I/O requests on connection failure");

MALLOC_DEFINE(M_NVMF, "nvmf", "NVMe over Fabrics host");

static void	nvmf_controller_loss_task(void *arg, int pending);
static void	nvmf_disconnect_task(void *arg, int pending);
static void	nvmf_request_reconnect(struct nvmf_softc *sc);
static void	nvmf_request_reconnect_task(void *arg, int pending);
static void	nvmf_shutdown_pre_sync(void *arg, int howto);
static void	nvmf_shutdown_post_sync(void *arg, int howto);

void
nvmf_complete(void *arg, const struct nvme_completion *cqe)
{
	struct nvmf_completion_status *status = arg;
	struct mtx *mtx;

	status->cqe = *cqe;
	mtx = mtx_pool_find(mtxpool_sleep, status);
	mtx_lock(mtx);
	status->done = true;
	mtx_unlock(mtx);
	wakeup(status);
}

void
nvmf_io_complete(void *arg, size_t xfered, int error)
{
	struct nvmf_completion_status *status = arg;
	struct mtx *mtx;

	status->io_error = error;
	mtx = mtx_pool_find(mtxpool_sleep, status);
	mtx_lock(mtx);
	status->io_done = true;
	mtx_unlock(mtx);
	wakeup(status);
}

void
nvmf_wait_for_reply(struct nvmf_completion_status *status)
{
	struct mtx *mtx;

	mtx = mtx_pool_find(mtxpool_sleep, status);
	mtx_lock(mtx);
	while (!status->done || !status->io_done)
		mtx_sleep(status, mtx, 0, "nvmfcmd", 0);
	mtx_unlock(mtx);
}

static int
nvmf_read_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
    uint64_t *value)
{
	const struct nvmf_fabric_prop_get_rsp *rsp;
	struct nvmf_completion_status status;

	nvmf_status_init(&status);
	if (!nvmf_cmd_get_property(sc, offset, size, nvmf_complete, &status,
	    M_WAITOK))
		return (ECONNABORTED);
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev, "PROPERTY_GET failed, status %#x\n",
		    le16toh(status.cqe.status));
		return (EIO);
	}

	rsp = (const struct nvmf_fabric_prop_get_rsp *)&status.cqe;
	if (size == 8)
		*value = le64toh(rsp->value.u64);
	else
		*value = le32toh(rsp->value.u32.low);
	return (0);
}

static int
nvmf_write_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
    uint64_t value)
{
	struct nvmf_completion_status status;

	nvmf_status_init(&status);
	if (!nvmf_cmd_set_property(sc, offset, size, value, nvmf_complete, &status,
	    M_WAITOK))
		return (ECONNABORTED);
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev, "PROPERTY_SET failed, status %#x\n",
		    le16toh(status.cqe.status));
		return (EIO);
	}
	return (0);
}

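/*
 * Initiate a normal controller shutdown by setting CC.SHN via the
 * Fabrics PROPERTY_GET/PROPERTY_SET commands.
 */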
static void
nvmf_shutdown_controller(struct nvmf_softc *sc)
{
	uint64_t cc;
	int error;

	error = nvmf_read_property(sc, NVMF_PROP_CC, 4, &cc);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch CC for shutdown\n");
		return;
	}

	cc |= NVMEF(NVME_CC_REG_SHN, NVME_SHN_NORMAL);

	error = nvmf_write_property(sc, NVMF_PROP_CC, 4, cc);
	if (error != 0)
		device_printf(sc->dev,
		    "Failed to set CC to trigger shutdown\n");
}

static void
nvmf_check_keep_alive(void *arg)
{
	struct nvmf_softc *sc = arg;
	int traffic;

	traffic = atomic_readandclear_int(&sc->ka_active_rx_traffic);
	if (traffic == 0) {
		device_printf(sc->dev,
		    "disconnecting due to KeepAlive timeout\n");
		nvmf_disconnect(sc);
		return;
	}

	callout_schedule_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0, C_HARDCLOCK);
}

static void
nvmf_keep_alive_complete(void *arg, const struct nvme_completion *cqe)
{
	struct nvmf_softc *sc = arg;

	atomic_store_int(&sc->ka_active_rx_traffic, 1);
	if (cqe->status != 0) {
		device_printf(sc->dev,
		    "KeepAlive response reported status %#x\n",
		    le16toh(cqe->status));
	}
}

static void
nvmf_send_keep_alive(void *arg)
{
	struct nvmf_softc *sc = arg;
	int traffic;

	/*
	 * Don't bother sending a KeepAlive command if TKAS is active
	 * and another command has been sent during the interval.
	 */
	traffic = atomic_load_int(&sc->ka_active_tx_traffic);
	if (traffic == 0 && !nvmf_cmd_keep_alive(sc, nvmf_keep_alive_complete,
	    sc, M_NOWAIT))
		device_printf(sc->dev,
		    "Failed to allocate KeepAlive command\n");

	/* Clear ka_active_tx_traffic after sending the keep alive command. */
	atomic_store_int(&sc->ka_active_tx_traffic, 0);

	callout_schedule_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, C_HARDCLOCK);
}

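/*
 * Copy in and validate a handoff nvlist from userland.  The nvlist
 * must describe the transport type, an admin queue, at least one I/O
 * queue (all of the same size), the controller data, and the
 * parameters needed to reconnect to the same subsystem later.
 */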
int
nvmf_copyin_handoff(const struct nvmf_ioc_nv *nv, nvlist_t **nvlp)
{
	const struct nvme_discovery_log_entry *dle;
	const struct nvme_controller_data *cdata;
	const nvlist_t *const *io;
	const nvlist_t *admin, *rparams;
	nvlist_t *nvl;
	size_t i, num_io_queues;
	uint32_t qsize;
	int error;

	error = nvmf_unpack_ioc_nvlist(nv, &nvl);
	if (error != 0)
		return (error);

	if (!nvlist_exists_number(nvl, "trtype") ||
	    !nvlist_exists_nvlist(nvl, "admin") ||
	    !nvlist_exists_nvlist_array(nvl, "io") ||
	    !nvlist_exists_binary(nvl, "cdata") ||
	    !nvlist_exists_nvlist(nvl, "rparams"))
		goto invalid;

	rparams = nvlist_get_nvlist(nvl, "rparams");
	if (!nvlist_exists_binary(rparams, "dle") ||
	    !nvlist_exists_string(rparams, "hostnqn") ||
	    !nvlist_exists_number(rparams, "num_io_queues") ||
	    !nvlist_exists_number(rparams, "io_qsize"))
		goto invalid;

	admin = nvlist_get_nvlist(nvl, "admin");
	if (!nvmf_validate_qpair_nvlist(admin, false))
		goto invalid;
	if (!nvlist_get_bool(admin, "admin"))
		goto invalid;

	io = nvlist_get_nvlist_array(nvl, "io", &num_io_queues);
	if (num_io_queues < 1 ||
	    num_io_queues != nvlist_get_number(rparams, "num_io_queues"))
		goto invalid;
	for (i = 0; i < num_io_queues; i++) {
		if (!nvmf_validate_qpair_nvlist(io[i], false))
			goto invalid;
	}

	/* Require all I/O queues to be the same size. */
	qsize = nvlist_get_number(rparams, "io_qsize");
	for (i = 0; i < num_io_queues; i++) {
		if (nvlist_get_number(io[i], "qsize") != qsize)
			goto invalid;
	}

	cdata = nvlist_get_binary(nvl, "cdata", &i);
	if (i != sizeof(*cdata))
		goto invalid;
	dle = nvlist_get_binary(rparams, "dle", &i);
	if (i != sizeof(*dle))
		goto invalid;

	if (memcmp(dle->subnqn, cdata->subnqn, sizeof(cdata->subnqn)) != 0)
		goto invalid;

	*nvlp = nvl;
	return (0);
invalid:
	nvlist_destroy(nvl);
	return (EINVAL);
}

static int
nvmf_probe(device_t dev)
{
	const nvlist_t *nvl = device_get_ivars(dev);
	const struct nvme_controller_data *cdata;

	if (nvl == NULL)
		return (ENXIO);

	cdata = nvlist_get_binary(nvl, "cdata", NULL);
	device_set_descf(dev, "Fabrics: %.256s", cdata->subnqn);
	return (BUS_PROBE_DEFAULT);
}

static int
nvmf_establish_connection(struct nvmf_softc *sc, nvlist_t *nvl)
{
	const nvlist_t *const *io;
	const nvlist_t *admin;
	uint64_t kato;
	size_t num_io_queues;
	enum nvmf_trtype trtype;
	char name[16];

	trtype = nvlist_get_number(nvl, "trtype");
	admin = nvlist_get_nvlist(nvl, "admin");
	io = nvlist_get_nvlist_array(nvl, "io", &num_io_queues);
	kato = dnvlist_get_number(nvl, "kato", 0);
	sc->reconnect_delay = dnvlist_get_number(nvl, "reconnect_delay", 0);
	sc->controller_loss_timeout = dnvlist_get_number(nvl,
	    "controller_loss_timeout", 0);

	/* Setup the admin queue. */
	sc->admin = nvmf_init_qp(sc, trtype, admin, "admin queue", 0);
	if (sc->admin == NULL) {
		device_printf(sc->dev, "Failed to setup admin queue\n");
		return (ENXIO);
	}

	/* Setup I/O queues. */
	sc->io = malloc(num_io_queues * sizeof(*sc->io), M_NVMF,
	    M_WAITOK | M_ZERO);
	sc->num_io_queues = num_io_queues;
	for (u_int i = 0; i < sc->num_io_queues; i++) {
		snprintf(name, sizeof(name), "I/O queue %u", i);
		sc->io[i] = nvmf_init_qp(sc, trtype, io[i], name, i);
		if (sc->io[i] == NULL) {
			device_printf(sc->dev, "Failed to setup I/O queue %u\n",
			    i);
			return (ENXIO);
		}
	}

	/* Start KeepAlive timers. */
	if (kato != 0) {
		sc->ka_traffic = NVMEV(NVME_CTRLR_DATA_CTRATT_TBKAS,
		    sc->cdata->ctratt) != 0;
		sc->ka_rx_sbt = mstosbt(kato);
		sc->ka_tx_sbt = sc->ka_rx_sbt / 2;
		callout_reset_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0,
		    nvmf_check_keep_alive, sc, C_HARDCLOCK);
		callout_reset_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0,
		    nvmf_send_keep_alive, sc, C_HARDCLOCK);
	}

	memcpy(sc->cdata, nvlist_get_binary(nvl, "cdata", NULL),
	    sizeof(*sc->cdata));

	/* Save reconnect parameters. */
	nvlist_destroy(sc->rparams);
	sc->rparams = nvlist_take_nvlist(nvl, "rparams");

	return (0);
}

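/*
 * Walk the active namespace list.  nvmf_scan_active_nslist() fetches
 * one page of active namespace IDs along with each namespace's
 * IDENTIFY data and invokes the callback for each namespace;
 * nvmf_scan_active_namespaces() loops until the list is exhausted.
 */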
typedef bool nvmf_scan_active_ns_cb(struct nvmf_softc *, uint32_t,
    const struct nvme_namespace_data *, void *);

static bool
nvmf_scan_active_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist,
    struct nvme_namespace_data *data, uint32_t *nsidp,
    nvmf_scan_active_ns_cb *cb, void *cb_arg)
{
	struct nvmf_completion_status status;
	uint32_t nsid;

	nvmf_status_init(&status);
	nvmf_status_wait_io(&status);
	if (!nvmf_cmd_identify_active_namespaces(sc, *nsidp, nslist,
	    nvmf_complete, &status, nvmf_io_complete, &status, M_WAITOK)) {
		device_printf(sc->dev,
		    "failed to send IDENTIFY active namespaces command\n");
		return (false);
	}
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev,
		    "IDENTIFY active namespaces failed, status %#x\n",
		    le16toh(status.cqe.status));
		return (false);
	}

	if (status.io_error != 0) {
		device_printf(sc->dev,
		    "IDENTIFY active namespaces failed with I/O error %d\n",
		    status.io_error);
		return (false);
	}

	for (u_int i = 0; i < nitems(nslist->ns); i++) {
		nsid = nslist->ns[i];
		if (nsid == 0) {
			*nsidp = 0;
			return (true);
		}

		nvmf_status_init(&status);
		nvmf_status_wait_io(&status);
		if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
		    &status, nvmf_io_complete, &status, M_WAITOK)) {
			device_printf(sc->dev,
			    "failed to send IDENTIFY namespace %u command\n",
			    nsid);
			return (false);
		}
		nvmf_wait_for_reply(&status);

		if (status.cqe.status != 0) {
			device_printf(sc->dev,
			    "IDENTIFY namespace %u failed, status %#x\n", nsid,
			    le16toh(status.cqe.status));
			return (false);
		}

		if (status.io_error != 0) {
			device_printf(sc->dev,
			    "IDENTIFY namespace %u failed with I/O error %d\n",
			    nsid, status.io_error);
			return (false);
		}

		nvme_namespace_data_swapbytes(data);
		if (!cb(sc, nsid, data, cb_arg))
			return (false);
	}

	MPASS(nsid == nslist->ns[nitems(nslist->ns) - 1] && nsid != 0);

	if (nsid >= NVME_GLOBAL_NAMESPACE_TAG - 1)
		*nsidp = 0;
	else
		*nsidp = nsid;
	return (true);
}

static bool
nvmf_scan_active_namespaces(struct nvmf_softc *sc, nvmf_scan_active_ns_cb *cb,
    void *cb_arg)
{
	struct nvme_namespace_data *data;
	struct nvme_ns_list *nslist;
	uint32_t nsid;
	bool retval;

	nslist = malloc(sizeof(*nslist), M_NVMF, M_WAITOK);
	data = malloc(sizeof(*data), M_NVMF, M_WAITOK);

	nsid = 0;
	retval = true;
	for (;;) {
		if (!nvmf_scan_active_nslist(sc, nslist, data, &nsid, cb,
		    cb_arg)) {
			retval = false;
			break;
		}
		if (nsid == 0)
			break;
	}

	free(data, M_NVMF);
	free(nslist, M_NVMF);
	return (retval);
}

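/*
 * Scan callback used during attach: instantiate a namespace for each
 * entry in the active namespace list.
 */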
static bool
nvmf_add_ns(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data, void *arg __unused)
{
	if (sc->ns[nsid - 1] != NULL) {
		device_printf(sc->dev,
		    "duplicate namespace %u in active namespace list\n",
		    nsid);
		return (false);
	}

	/*
	 * As in nvme_ns_construct, a size of zero indicates an
	 * invalid namespace.
	 */
	if (data->nsze == 0) {
		device_printf(sc->dev,
		    "ignoring active namespace %u with zero size\n", nsid);
		return (true);
	}

	sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);

	nvmf_sim_rescan_ns(sc, nsid);
	return (true);
}

static bool
nvmf_add_namespaces(struct nvmf_softc *sc)
{
	sc->ns = mallocarray(sc->cdata->nn, sizeof(*sc->ns), M_NVMF,
	    M_WAITOK | M_ZERO);
	return (nvmf_scan_active_namespaces(sc, nvmf_add_ns, NULL));
}

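/*
 * Attach: establish the association described by the handoff nvlist,
 * fetch CAP and VS, honor MDTS when sizing transfers, create the CAM
 * SIM and namespaces, and register the control device and shutdown
 * event handlers.
 */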
static int
nvmf_attach(device_t dev)
{
	struct make_dev_args mda;
	struct nvmf_softc *sc = device_get_softc(dev);
	nvlist_t *nvl = device_get_ivars(dev);
	const nvlist_t * const *io;
	struct sysctl_oid *oid;
	uint64_t val;
	u_int i;
	int error;

	if (nvl == NULL)
		return (ENXIO);

	sc->dev = dev;
	sc->trtype = nvlist_get_number(nvl, "trtype");
	callout_init(&sc->ka_rx_timer, 1);
	callout_init(&sc->ka_tx_timer, 1);
	sx_init(&sc->connection_lock, "nvmf connection");
	TASK_INIT(&sc->disconnect_task, 0, nvmf_disconnect_task, sc);
	TIMEOUT_TASK_INIT(nvmf_tq, &sc->controller_loss_task, 0,
	    nvmf_controller_loss_task, sc);
	TIMEOUT_TASK_INIT(nvmf_tq, &sc->request_reconnect_task, 0,
	    nvmf_request_reconnect_task, sc);

	oid = SYSCTL_ADD_NODE(device_get_sysctl_ctx(dev),
	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "ioq",
	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "I/O Queues");
	sc->ioq_oid_list = SYSCTL_CHILDREN(oid);

	sc->cdata = malloc(sizeof(*sc->cdata), M_NVMF, M_WAITOK);

	nvmf_init_aer(sc);

	error = nvmf_establish_connection(sc, nvl);
	if (error != 0)
		goto out;

	error = nvmf_read_property(sc, NVMF_PROP_CAP, 8, &sc->cap);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch CAP\n");
		error = ENXIO;
		goto out;
	}

	error = nvmf_read_property(sc, NVMF_PROP_VS, 4, &val);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch VS\n");
		error = ENXIO;
		goto out;
	}
	sc->vs = val;

	/* Honor MDTS if it is set. */
	sc->max_xfer_size = maxphys;
	if (sc->cdata->mdts != 0) {
		sc->max_xfer_size = ulmin(sc->max_xfer_size,
		    1 << (sc->cdata->mdts + NVME_MPS_SHIFT +
		    NVME_CAP_HI_MPSMIN(sc->cap >> 32)));
	}

	io = nvlist_get_nvlist_array(nvl, "io", NULL);
	sc->max_pending_io = nvlist_get_number(io[0], "qsize") *
	    sc->num_io_queues;

	error = nvmf_init_sim(sc);
	if (error != 0)
		goto out;

	error = nvmf_start_aer(sc);
	if (error != 0) {
		nvmf_destroy_sim(sc);
		goto out;
	}

	if (!nvmf_add_namespaces(sc)) {
		nvmf_destroy_sim(sc);
		goto out;
	}

	make_dev_args_init(&mda);
	mda.mda_devsw = &nvmf_cdevsw;
	mda.mda_uid = UID_ROOT;
	mda.mda_gid = GID_WHEEL;
	mda.mda_mode = 0600;
	mda.mda_si_drv1 = sc;
	error = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev));
	if (error != 0) {
		nvmf_destroy_sim(sc);
		goto out;
	}

	sc->shutdown_pre_sync_eh = EVENTHANDLER_REGISTER(shutdown_pre_sync,
	    nvmf_shutdown_pre_sync, sc, SHUTDOWN_PRI_FIRST);
	sc->shutdown_post_sync_eh = EVENTHANDLER_REGISTER(shutdown_post_sync,
	    nvmf_shutdown_post_sync, sc, SHUTDOWN_PRI_LAST);

	return (0);
out:
	if (sc->ns != NULL) {
		for (i = 0; i < sc->cdata->nn; i++) {
			if (sc->ns[i] != NULL)
				nvmf_destroy_ns(sc->ns[i]);
		}
		free(sc->ns, M_NVMF);
	}

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);

	if (sc->admin != NULL)
		nvmf_shutdown_controller(sc);

	for (i = 0; i < sc->num_io_queues; i++) {
		if (sc->io[i] != NULL)
			nvmf_destroy_qp(sc->io[i]);
	}
	free(sc->io, M_NVMF);
	if (sc->admin != NULL)
		nvmf_destroy_qp(sc->admin);

	nvmf_destroy_aer(sc);

	taskqueue_drain_timeout(nvmf_tq, &sc->request_reconnect_task);
	taskqueue_drain_timeout(nvmf_tq, &sc->controller_loss_task);
	taskqueue_drain(nvmf_tq, &sc->disconnect_task);
	sx_destroy(&sc->connection_lock);
	nvlist_destroy(sc->rparams);
	free(sc->cdata, M_NVMF);
	return (error);
}

void
nvmf_disconnect(struct nvmf_softc *sc)
{
	taskqueue_enqueue(nvmf_tq, &sc->disconnect_task);
}

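/*
 * Handle a disconnection: quiesce the SIM and namespaces, tear down
 * all queue pairs, and schedule the reconnect request and controller
 * loss timers if they are configured.
 */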
static void
nvmf_disconnect_task(void *arg, int pending __unused)
{
	struct nvmf_softc *sc = arg;
	u_int i;

	sx_xlock(&sc->connection_lock);
	if (sc->admin == NULL) {
		/*
		 * Ignore transport errors if there is no active
		 * association.
		 */
		sx_xunlock(&sc->connection_lock);
		return;
	}

	if (sc->detaching) {
		if (sc->admin != NULL) {
			/*
			 * This unsticks the detach process if a
			 * transport error occurs during detach.
			 */
			nvmf_shutdown_qp(sc->admin);
		}
		sx_xunlock(&sc->connection_lock);
		return;
	}

	if (sc->cdev == NULL) {
		/*
		 * Transport error occurred during attach
		 * (nvmf_add_namespaces).  Shutdown the admin queue.
		 */
		nvmf_shutdown_qp(sc->admin);
		sx_xunlock(&sc->connection_lock);
		return;
	}

	nanotime(&sc->last_disconnect);
	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);
	sc->ka_traffic = false;

	/* Quiesce namespace consumers. */
	nvmf_disconnect_sim(sc);
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_disconnect_ns(sc->ns[i]);
	}

	/* Shutdown the existing qpairs. */
	for (i = 0; i < sc->num_io_queues; i++) {
		nvmf_destroy_qp(sc->io[i]);
	}
	free(sc->io, M_NVMF);
	sc->io = NULL;
	sc->num_io_queues = 0;
	nvmf_destroy_qp(sc->admin);
	sc->admin = NULL;

	if (sc->reconnect_delay != 0)
		nvmf_request_reconnect(sc);
	if (sc->controller_loss_timeout != 0)
		taskqueue_enqueue_timeout(nvmf_tq,
		    &sc->controller_loss_task, sc->controller_loss_timeout *
		    hz);

	sx_xunlock(&sc->connection_lock);
}

static void
nvmf_controller_loss_task(void *arg, int pending)
{
	struct nvmf_softc *sc = arg;
	device_t dev;
	int error;

	bus_topo_lock();
	sx_xlock(&sc->connection_lock);
	if (sc->admin != NULL || sc->detaching) {
		/* Reconnected or already detaching. */
		sx_xunlock(&sc->connection_lock);
		bus_topo_unlock();
		return;
	}

	sc->controller_timedout = true;
	sx_xunlock(&sc->connection_lock);

	/*
	 * XXX: Doing this from here is a bit ugly.  We don't have an
	 * extra reference on `dev` but bus_topo_lock should block any
	 * concurrent device_delete_child invocations.
	 */
	dev = sc->dev;
	error = device_delete_child(root_bus, dev);
	if (error != 0)
		device_printf(dev,
		    "failed to detach after controller loss: %d\n", error);
	bus_topo_unlock();
}

static void
nvmf_request_reconnect(struct nvmf_softc *sc)
{
	char buf[64];

	sx_assert(&sc->connection_lock, SX_LOCKED);

	snprintf(buf, sizeof(buf), "name=\"%s\"", device_get_nameunit(sc->dev));
	devctl_notify("nvme", "controller", "RECONNECT", buf);
	taskqueue_enqueue_timeout(nvmf_tq, &sc->request_reconnect_task,
	    sc->reconnect_delay * hz);
}

static void
nvmf_request_reconnect_task(void *arg, int pending)
{
	struct nvmf_softc *sc = arg;

	sx_xlock(&sc->connection_lock);
	if (sc->admin != NULL || sc->detaching || sc->controller_timedout) {
		/* Reconnected or already detaching. */
		sx_xunlock(&sc->connection_lock);
		return;
	}

	nvmf_request_reconnect(sc);
	sx_xunlock(&sc->connection_lock);
}

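/*
 * Handle the NVMF_RECONNECT_HOST ioctl: validate the new handoff data
 * against the existing association (same transport type and subsystem
 * NQN) before establishing new queue pairs and restarting namespace
 * consumers.
 */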
static int
nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
{
	const struct nvme_controller_data *cdata;
	nvlist_t *nvl;
	u_int i;
	int error;

	error = nvmf_copyin_handoff(nv, &nvl);
	if (error != 0)
		return (error);

	/* XXX: Should we permit changing the transport type? */
	if (sc->trtype != nvlist_get_number(nvl, "trtype")) {
		device_printf(sc->dev,
		    "transport type mismatch on reconnect\n");
		return (EINVAL);
	}

	sx_xlock(&sc->connection_lock);
	if (sc->admin != NULL || sc->detaching || sc->controller_timedout) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Ensure this is for the same controller.  Note that the
	 * controller ID can vary across associations if the remote
	 * system is using the dynamic controller model.  This merely
	 * ensures the new association is connected to the same NVMe
	 * subsystem.
	 */
	cdata = nvlist_get_binary(nvl, "cdata", NULL);
	if (memcmp(sc->cdata->subnqn, cdata->subnqn,
	    sizeof(cdata->subnqn)) != 0) {
		device_printf(sc->dev,
		    "controller subsystem NQN mismatch on reconnect\n");
		error = EINVAL;
		goto out;
	}

	/*
	 * XXX: Require same number and size of I/O queues so that
	 * max_pending_io is still correct?
	 */

	error = nvmf_establish_connection(sc, nvl);
	if (error != 0)
		goto out;

	error = nvmf_start_aer(sc);
	if (error != 0)
		goto out;

	device_printf(sc->dev,
	    "established new association with %u I/O queues\n",
	    sc->num_io_queues);

	/* Restart namespace consumers. */
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_reconnect_ns(sc->ns[i]);
	}
	nvmf_reconnect_sim(sc);

	nvmf_rescan_all_ns(sc);

	taskqueue_cancel_timeout(nvmf_tq, &sc->request_reconnect_task, NULL);
	taskqueue_cancel_timeout(nvmf_tq, &sc->controller_loss_task, NULL);
out:
	sx_xunlock(&sc->connection_lock);
	nvlist_destroy(nvl);
	return (error);
}

static void
nvmf_shutdown_pre_sync(void *arg, int howto)
{
	struct nvmf_softc *sc = arg;

	if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED())
		return;

	/*
	 * If this association is disconnected, abort any pending
	 * requests with an error to permit filesystems to unmount
	 * without hanging.
	 */
	sx_xlock(&sc->connection_lock);
	if (sc->admin != NULL || sc->detaching) {
		sx_xunlock(&sc->connection_lock);
		return;
	}

	for (u_int i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_shutdown_ns(sc->ns[i]);
	}
	nvmf_shutdown_sim(sc);
	sx_xunlock(&sc->connection_lock);
}

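/*
 * Late shutdown hook: if the association is still connected, stop the
 * KeepAlive timers, request a controller shutdown, quiesce consumers,
 * and destroy the queue pairs.
 */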
static void
nvmf_shutdown_post_sync(void *arg, int howto)
{
	struct nvmf_softc *sc = arg;

	if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED())
		return;

	/*
	 * If this association is connected, disconnect gracefully.
	 */
	sx_xlock(&sc->connection_lock);
	if (sc->admin == NULL || sc->detaching) {
		sx_xunlock(&sc->connection_lock);
		return;
	}

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);

	nvmf_shutdown_controller(sc);

	/*
	 * Quiesce consumers so that any commands submitted after this
	 * fail with an error.  Notably, nda(4) calls nda_flush() from
	 * a post_sync handler that might be ordered after this one.
	 */
	for (u_int i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_shutdown_ns(sc->ns[i]);
	}
	nvmf_shutdown_sim(sc);

	for (u_int i = 0; i < sc->num_io_queues; i++) {
		nvmf_destroy_qp(sc->io[i]);
	}
	nvmf_destroy_qp(sc->admin);
	sc->admin = NULL;
	sx_xunlock(&sc->connection_lock);
}

static int
nvmf_detach(device_t dev)
{
	struct nvmf_softc *sc = device_get_softc(dev);
	u_int i;

	destroy_dev(sc->cdev);

	sx_xlock(&sc->connection_lock);
	sc->detaching = true;
	sx_xunlock(&sc->connection_lock);

	EVENTHANDLER_DEREGISTER(shutdown_pre_sync, sc->shutdown_pre_sync_eh);
	EVENTHANDLER_DEREGISTER(shutdown_post_sync, sc->shutdown_post_sync_eh);

	nvmf_destroy_sim(sc);
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_destroy_ns(sc->ns[i]);
	}
	free(sc->ns, M_NVMF);

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);

	if (sc->admin != NULL)
		nvmf_shutdown_controller(sc);

	for (i = 0; i < sc->num_io_queues; i++) {
		nvmf_destroy_qp(sc->io[i]);
	}
	free(sc->io, M_NVMF);

	taskqueue_drain(nvmf_tq, &sc->disconnect_task);
	if (taskqueue_cancel_timeout(nvmf_tq, &sc->request_reconnect_task,
	    NULL) != 0)
		taskqueue_drain_timeout(nvmf_tq, &sc->request_reconnect_task);

	/*
	 * Don't cancel/drain the controller loss task if that task
	 * has fired and is triggering the detach.
	 */
	if (!sc->controller_timedout) {
		if (taskqueue_cancel_timeout(nvmf_tq, &sc->controller_loss_task,
		    NULL) != 0)
			taskqueue_drain_timeout(nvmf_tq,
			    &sc->controller_loss_task);
	}

	if (sc->admin != NULL)
		nvmf_destroy_qp(sc->admin);

	nvmf_destroy_aer(sc);

	sx_destroy(&sc->connection_lock);
	nvlist_destroy(sc->rparams);
	free(sc->cdata, M_NVMF);
	return (0);
}

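/*
 * Refresh the state of a single namespace from new IDENTIFY data,
 * creating, updating, or destroying it as needed, and request a CAM
 * rescan of the namespace.
 */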
static void
nvmf_rescan_ns_1(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data)
{
	struct nvmf_namespace *ns;

	/* XXX: Needs locking around sc->ns[]. */
	ns = sc->ns[nsid - 1];
	if (data->nsze == 0) {
		/* XXX: Needs locking */
		if (ns != NULL) {
			nvmf_destroy_ns(ns);
			sc->ns[nsid - 1] = NULL;
		}
	} else {
		/* XXX: Needs locking */
		if (ns == NULL) {
			sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
		} else {
			if (!nvmf_update_ns(ns, data)) {
				nvmf_destroy_ns(ns);
				sc->ns[nsid - 1] = NULL;
			}
		}
	}

	nvmf_sim_rescan_ns(sc, nsid);
}

void
nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid)
{
	struct nvmf_completion_status status;
	struct nvme_namespace_data *data;

	data = malloc(sizeof(*data), M_NVMF, M_WAITOK);

	nvmf_status_init(&status);
	nvmf_status_wait_io(&status);
	if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
	    &status, nvmf_io_complete, &status, M_WAITOK)) {
		device_printf(sc->dev,
		    "failed to send IDENTIFY namespace %u command\n", nsid);
		free(data, M_NVMF);
		return;
	}
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev,
		    "IDENTIFY namespace %u failed, status %#x\n", nsid,
		    le16toh(status.cqe.status));
		free(data, M_NVMF);
		return;
	}

	if (status.io_error != 0) {
		device_printf(sc->dev,
		    "IDENTIFY namespace %u failed with I/O error %d\n",
		    nsid, status.io_error);
		free(data, M_NVMF);
		return;
	}

	nvme_namespace_data_swapbytes(data);

	nvmf_rescan_ns_1(sc, nsid, data);

	free(data, M_NVMF);
}

static void
nvmf_purge_namespaces(struct nvmf_softc *sc, uint32_t first_nsid,
    uint32_t next_valid_nsid)
{
	struct nvmf_namespace *ns;

	for (uint32_t nsid = first_nsid; nsid < next_valid_nsid; nsid++)
	{
		/* XXX: Needs locking around sc->ns[]. */
		ns = sc->ns[nsid - 1];
		if (ns != NULL) {
			nvmf_destroy_ns(ns);
			sc->ns[nsid - 1] = NULL;

			nvmf_sim_rescan_ns(sc, nsid);
		}
	}
}

static bool
nvmf_rescan_ns_cb(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data, void *arg)
{
	uint32_t *last_nsid = arg;

	/* Check for any gaps prior to this namespace. */
	nvmf_purge_namespaces(sc, *last_nsid + 1, nsid);
	*last_nsid = nsid;

	nvmf_rescan_ns_1(sc, nsid, data);
	return (true);
}

void
nvmf_rescan_all_ns(struct nvmf_softc *sc)
{
	uint32_t last_nsid;

	last_nsid = 0;
	if (!nvmf_scan_active_namespaces(sc, nvmf_rescan_ns_cb, &last_nsid))
		return;

	/*
	 * Check for any namespace devices after the last active
	 * namespace.
	 */
	nvmf_purge_namespaces(sc, last_nsid + 1, sc->cdata->nn + 1);
}

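/*
 * Execute a passthrough command on behalf of the NVME_PASSTHROUGH_CMD
 * ioctl, bouncing any user data through a temporary kernel buffer.
 */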
int
nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt,
    bool admin)
{
	struct nvmf_completion_status status;
	struct nvme_command cmd;
	struct memdesc mem;
	struct nvmf_host_qpair *qp;
	struct nvmf_request *req;
	void *buf;
	int error;

	if (pt->len > sc->max_xfer_size)
		return (EINVAL);

	buf = NULL;
	if (pt->len != 0) {
		/*
		 * XXX: Depending on the size we may want to pin the
		 * user pages and use a memdesc with vm_page_t's
		 * instead.
		 */
		buf = malloc(pt->len, M_NVMF, M_WAITOK);
		if (pt->is_read == 0) {
			error = copyin(pt->buf, buf, pt->len);
			if (error != 0) {
				free(buf, M_NVMF);
				return (error);
			}
		} else {
			/* Ensure no kernel data is leaked to userland. */
			memset(buf, 0, pt->len);
		}
	}

	memset(&cmd, 0, sizeof(cmd));
	cmd.opc = pt->cmd.opc;
	cmd.fuse = pt->cmd.fuse;
	cmd.nsid = pt->cmd.nsid;
	cmd.cdw10 = pt->cmd.cdw10;
	cmd.cdw11 = pt->cmd.cdw11;
	cmd.cdw12 = pt->cmd.cdw12;
	cmd.cdw13 = pt->cmd.cdw13;
	cmd.cdw14 = pt->cmd.cdw14;
	cmd.cdw15 = pt->cmd.cdw15;

	sx_slock(&sc->connection_lock);
	if (sc->admin == NULL || sc->detaching) {
		device_printf(sc->dev,
		    "failed to send passthrough command\n");
		error = ECONNABORTED;
		sx_sunlock(&sc->connection_lock);
		goto error;
	}
	if (admin)
		qp = sc->admin;
	else
		qp = nvmf_select_io_queue(sc);
	nvmf_status_init(&status);
	req = nvmf_allocate_request(qp, &cmd, nvmf_complete, &status, M_WAITOK);
	sx_sunlock(&sc->connection_lock);
	if (req == NULL) {
		device_printf(sc->dev, "failed to send passthrough command\n");
		error = ECONNABORTED;
		goto error;
	}

	if (pt->len != 0) {
		mem = memdesc_vaddr(buf, pt->len);
		nvmf_capsule_append_data(req->nc, &mem, pt->len,
		    pt->is_read == 0, nvmf_io_complete, &status);
		nvmf_status_wait_io(&status);
	}

	nvmf_submit_request(req);
	nvmf_wait_for_reply(&status);

	memset(&pt->cpl, 0, sizeof(pt->cpl));
	pt->cpl.cdw0 = status.cqe.cdw0;
	pt->cpl.status = status.cqe.status;

	error = status.io_error;
	if (error == 0 && pt->len != 0 && pt->is_read != 0)
		error = copyout(buf, pt->buf, pt->len);
error:
	free(buf, M_NVMF);
	return (error);
}

static int
nvmf_reconnect_params(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
{
	int error;

	sx_slock(&sc->connection_lock);
	error = nvmf_pack_ioc_nvlist(sc->rparams, nv);
	sx_sunlock(&sc->connection_lock);

	return (error);
}

static int
nvmf_connection_status(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
{
	nvlist_t *nvl, *nvl_ts;
	int error;

	nvl = nvlist_create(0);
	nvl_ts = nvlist_create(0);

	sx_slock(&sc->connection_lock);
	nvlist_add_bool(nvl, "connected", sc->admin != NULL);
	nvlist_add_number(nvl_ts, "tv_sec", sc->last_disconnect.tv_sec);
	nvlist_add_number(nvl_ts, "tv_nsec", sc->last_disconnect.tv_nsec);
	sx_sunlock(&sc->connection_lock);
	nvlist_move_nvlist(nvl, "last_disconnect", nvl_ts);

	error = nvmf_pack_ioc_nvlist(nvl, nv);
	nvlist_destroy(nvl);
	return (error);
}

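/*
 * Control device ioctls: NVMe passthrough and identity queries plus
 * the nvmf-specific reconnect and connection status requests.
 */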
static int
nvmf_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
    struct thread *td)
{
	struct nvmf_softc *sc = cdev->si_drv1;
	struct nvme_get_nsid *gnsid;
	struct nvme_pt_command *pt;
	struct nvmf_ioc_nv *nv;

	switch (cmd) {
	case NVME_PASSTHROUGH_CMD:
		pt = (struct nvme_pt_command *)arg;
		return (nvmf_passthrough_cmd(sc, pt, true));
	case NVME_GET_NSID:
		gnsid = (struct nvme_get_nsid *)arg;
		strlcpy(gnsid->cdev, device_get_nameunit(sc->dev),
		    sizeof(gnsid->cdev));
		gnsid->nsid = 0;
		return (0);
	case NVME_GET_MAX_XFER_SIZE:
		*(uint64_t *)arg = sc->max_xfer_size;
		return (0);
	case NVME_GET_CONTROLLER_DATA:
		memcpy(arg, sc->cdata, sizeof(*sc->cdata));
		return (0);
	case NVMF_RECONNECT_PARAMS:
		nv = (struct nvmf_ioc_nv *)arg;
		return (nvmf_reconnect_params(sc, nv));
	case NVMF_RECONNECT_HOST:
		nv = (struct nvmf_ioc_nv *)arg;
		return (nvmf_reconnect_host(sc, nv));
	case NVMF_CONNECTION_STATUS:
		nv = (struct nvmf_ioc_nv *)arg;
		return (nvmf_connection_status(sc, nv));
	default:
		return (ENOTTY);
	}
}

static struct cdevsw nvmf_cdevsw = {
	.d_version = D_VERSION,
	.d_ioctl = nvmf_ioctl
};

static int
nvmf_modevent(module_t mod, int what, void *arg)
{
	int error;

	switch (what) {
	case MOD_LOAD:
		error = nvmf_ctl_load();
		if (error != 0)
			return (error);

		nvmf_tq = taskqueue_create("nvmf", M_WAITOK | M_ZERO,
		    taskqueue_thread_enqueue, &nvmf_tq);
		taskqueue_start_threads(&nvmf_tq, 1, PWAIT, "nvmf taskq");
		return (0);
	case MOD_QUIESCE:
		return (0);
	case MOD_UNLOAD:
		nvmf_ctl_unload();
		destroy_dev_drain(&nvmf_cdevsw);
		if (nvmf_tq != NULL)
			taskqueue_free(nvmf_tq);
		return (0);
	default:
		return (EOPNOTSUPP);
	}
}

static device_method_t nvmf_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe, nvmf_probe),
	DEVMETHOD(device_attach, nvmf_attach),
	DEVMETHOD(device_detach, nvmf_detach),
	DEVMETHOD_END
};

driver_t nvme_nvmf_driver = {
	"nvme",
	nvmf_methods,
	sizeof(struct nvmf_softc),
};

DRIVER_MODULE(nvme, root, nvme_nvmf_driver, nvmf_modevent, NULL);
MODULE_DEPEND(nvmf, nvmf_transport, 1, 1, 1);