/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2023-2024 Chelsio Communications, Inc.
 * Written by: John Baldwin <jhb@FreeBSD.org>
 */

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/dnv.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/memdesc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/nv.h>
#include <sys/reboot.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/nvmf.h>
#include <dev/nvmf/nvmf_transport.h>
#include <dev/nvmf/host/nvmf_var.h>

static struct cdevsw nvmf_cdevsw;
static struct taskqueue *nvmf_tq;

bool nvmf_fail_disconnect = false;
SYSCTL_BOOL(_kern_nvmf, OID_AUTO, fail_on_disconnection, CTLFLAG_RWTUN,
    &nvmf_fail_disconnect, 0, "Fail I/O requests on connection failure");

MALLOC_DEFINE(M_NVMF, "nvmf", "NVMe over Fabrics host");

static void	nvmf_controller_loss_task(void *arg, int pending);
static void	nvmf_disconnect_task(void *arg, int pending);
static void	nvmf_request_reconnect(struct nvmf_softc *sc);
static void	nvmf_request_reconnect_task(void *arg, int pending);
static void	nvmf_shutdown_pre_sync(void *arg, int howto);
static void	nvmf_shutdown_post_sync(void *arg, int howto);

void
nvmf_complete(void *arg, const struct nvme_completion *cqe)
{
	struct nvmf_completion_status *status = arg;
	struct mtx *mtx;

	status->cqe = *cqe;
	mtx = mtx_pool_find(mtxpool_sleep, status);
	mtx_lock(mtx);
	status->done = true;
	mtx_unlock(mtx);
	wakeup(status);
}

void
nvmf_io_complete(void *arg, size_t xfered, int error)
{
	struct nvmf_completion_status *status = arg;
	struct mtx *mtx;

	status->io_error = error;
	mtx = mtx_pool_find(mtxpool_sleep, status);
	mtx_lock(mtx);
	status->io_done = true;
	mtx_unlock(mtx);
	wakeup(status);
}

void
nvmf_wait_for_reply(struct nvmf_completion_status *status)
{
	struct mtx *mtx;

	mtx = mtx_pool_find(mtxpool_sleep, status);
	mtx_lock(mtx);
	while (!status->done || !status->io_done)
		mtx_sleep(status, mtx, 0, "nvmfcmd", 0);
	mtx_unlock(mtx);
}

static int
nvmf_read_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
    uint64_t *value)
{
	const struct nvmf_fabric_prop_get_rsp *rsp;
	struct nvmf_completion_status status;

	nvmf_status_init(&status);
	if (!nvmf_cmd_get_property(sc, offset, size, nvmf_complete, &status,
	    M_WAITOK))
		return (ECONNABORTED);
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev, "PROPERTY_GET failed, status %#x\n",
		    le16toh(status.cqe.status));
		return (EIO);
	}

	rsp = (const struct nvmf_fabric_prop_get_rsp *)&status.cqe;
	if (size == 8)
		*value = le64toh(rsp->value.u64);
	else
		*value = le32toh(rsp->value.u32.low);
	return (0);
}

static int
nvmf_write_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
    uint64_t value)
{
	struct nvmf_completion_status status;

	nvmf_status_init(&status);
	if (!nvmf_cmd_set_property(sc, offset, size, value, nvmf_complete,
	    &status, M_WAITOK))
		return (ECONNABORTED);
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev, "PROPERTY_SET failed, status %#x\n",
		    le16toh(status.cqe.status));
		return (EIO);
	}
	return (0);
}
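
/*
 * NVMe over Fabrics has no BAR: controller registers (CAP, VS, CC,
 * CSTS, ...) are reached via the fabrics PROPERTY_GET/PROPERTY_SET
 * commands.  The helpers above issue those commands and sleep on the
 * shared mutex pool until nvmf_complete() posts the answer, e.g.:
 *
 *	uint64_t csts;
 *	error = nvmf_read_property(sc, NVMF_PROP_CSTS, 4, &csts);
 *
 * (Illustrative sketch only; NVMF_PROP_CSTS as a property name is an
 * assumption and does not appear in this file.)
 */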

static void
nvmf_shutdown_controller(struct nvmf_softc *sc)
{
	uint64_t cc;
	int error;

	error = nvmf_read_property(sc, NVMF_PROP_CC, 4, &cc);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch CC for shutdown\n");
		return;
	}

	cc |= NVMEF(NVME_CC_REG_SHN, NVME_SHN_NORMAL);

	error = nvmf_write_property(sc, NVMF_PROP_CC, 4, cc);
	if (error != 0)
		device_printf(sc->dev,
		    "Failed to set CC to trigger shutdown\n");
}

static void
nvmf_check_keep_alive(void *arg)
{
	struct nvmf_softc *sc = arg;
	int traffic;

	traffic = atomic_readandclear_int(&sc->ka_active_rx_traffic);
	if (traffic == 0) {
		device_printf(sc->dev,
		    "disconnecting due to KeepAlive timeout\n");
		nvmf_disconnect(sc);
		return;
	}

	callout_schedule_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0, C_HARDCLOCK);
}

static void
nvmf_keep_alive_complete(void *arg, const struct nvme_completion *cqe)
{
	struct nvmf_softc *sc = arg;

	atomic_store_int(&sc->ka_active_rx_traffic, 1);
	if (cqe->status != 0) {
		device_printf(sc->dev,
		    "KeepAlive response reported status %#x\n",
		    le16toh(cqe->status));
	}
}

static void
nvmf_send_keep_alive(void *arg)
{
	struct nvmf_softc *sc = arg;
	int traffic;

	/*
	 * Don't bother sending a KeepAlive command if TKAS is active
	 * and another command has been sent during the interval.
	 */
	traffic = atomic_load_int(&sc->ka_active_tx_traffic);
	if (traffic == 0 && !nvmf_cmd_keep_alive(sc, nvmf_keep_alive_complete,
	    sc, M_NOWAIT))
		device_printf(sc->dev,
		    "Failed to allocate KeepAlive command\n");

	/* Clear ka_active_tx_traffic after sending the keep alive command. */
	atomic_store_int(&sc->ka_active_tx_traffic, 0);

	callout_schedule_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, C_HARDCLOCK);
}
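
/*
 * KeepAlive scheme: the TX timer fires every KATO/2 and sends a
 * KeepAlive command unless TBKAS is enabled and other traffic already
 * went out during the interval; the RX timer fires every KATO and
 * forces a disconnect if no traffic at all was received since it last
 * ran.  Timer setup is in nvmf_establish_connection() below.
 */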

int
nvmf_copyin_handoff(const struct nvmf_ioc_nv *nv, nvlist_t **nvlp)
{
	const struct nvme_discovery_log_entry *dle;
	const struct nvme_controller_data *cdata;
	const nvlist_t *const *io;
	const nvlist_t *admin, *rparams;
	nvlist_t *nvl;
	size_t i, num_io_queues;
	uint32_t qsize;
	int error;

	error = nvmf_unpack_ioc_nvlist(nv, &nvl);
	if (error != 0)
		return (error);

	if (!nvlist_exists_number(nvl, "trtype") ||
	    !nvlist_exists_nvlist(nvl, "admin") ||
	    !nvlist_exists_nvlist_array(nvl, "io") ||
	    !nvlist_exists_binary(nvl, "cdata") ||
	    !nvlist_exists_nvlist(nvl, "rparams"))
		goto invalid;

	rparams = nvlist_get_nvlist(nvl, "rparams");
	if (!nvlist_exists_binary(rparams, "dle") ||
	    !nvlist_exists_string(rparams, "hostnqn") ||
	    !nvlist_exists_number(rparams, "num_io_queues") ||
	    !nvlist_exists_number(rparams, "io_qsize"))
		goto invalid;

	admin = nvlist_get_nvlist(nvl, "admin");
	if (!nvmf_validate_qpair_nvlist(admin, false))
		goto invalid;
	if (!nvlist_get_bool(admin, "admin"))
		goto invalid;

	io = nvlist_get_nvlist_array(nvl, "io", &num_io_queues);
	if (num_io_queues < 1 ||
	    num_io_queues != nvlist_get_number(rparams, "num_io_queues"))
		goto invalid;
	for (i = 0; i < num_io_queues; i++) {
		if (!nvmf_validate_qpair_nvlist(io[i], false))
			goto invalid;
	}

	/* Require all I/O queues to be the same size. */
	qsize = nvlist_get_number(rparams, "io_qsize");
	for (i = 0; i < num_io_queues; i++) {
		if (nvlist_get_number(io[i], "qsize") != qsize)
			goto invalid;
	}

	cdata = nvlist_get_binary(nvl, "cdata", &i);
	if (i != sizeof(*cdata))
		goto invalid;
	dle = nvlist_get_binary(rparams, "dle", &i);
	if (i != sizeof(*dle))
		goto invalid;

	if (memcmp(dle->subnqn, cdata->subnqn, sizeof(cdata->subnqn)) != 0)
		goto invalid;

	*nvlp = nvl;
	return (0);
invalid:
	nvlist_destroy(nvl);
	return (EINVAL);
}
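
/*
 * Layout of the handoff nvlist validated above (derived from the
 * checks, not a normative description):
 *
 *	"trtype"	number		transport type
 *	"admin"		nvlist		admin queue pair state
 *	"io"		nvlist array	I/O queue pair state, >= 1 entry
 *	"cdata"		binary		struct nvme_controller_data
 *	"rparams"	nvlist		reconnect parameters:
 *	    "dle"		binary	struct nvme_discovery_log_entry
 *	    "hostnqn"		string
 *	    "num_io_queues"	number
 *	    "io_qsize"		number
 */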

static int
nvmf_probe(device_t dev)
{
	const nvlist_t *nvl = device_get_ivars(dev);
	const struct nvme_controller_data *cdata;

	if (nvl == NULL)
		return (ENXIO);

	cdata = nvlist_get_binary(nvl, "cdata", NULL);
	device_set_descf(dev, "Fabrics: %.256s", cdata->subnqn);
	return (BUS_PROBE_DEFAULT);
}

static int
nvmf_establish_connection(struct nvmf_softc *sc, nvlist_t *nvl)
{
	const nvlist_t *const *io;
	const nvlist_t *admin;
	uint64_t kato;
	size_t num_io_queues;
	enum nvmf_trtype trtype;
	char name[16];

	trtype = nvlist_get_number(nvl, "trtype");
	admin = nvlist_get_nvlist(nvl, "admin");
	io = nvlist_get_nvlist_array(nvl, "io", &num_io_queues);
	kato = dnvlist_get_number(nvl, "kato", 0);
	sc->reconnect_delay = dnvlist_get_number(nvl, "reconnect_delay", 0);
	sc->controller_loss_timeout = dnvlist_get_number(nvl,
	    "controller_loss_timeout", 0);

	/* Setup the admin queue. */
	sc->admin = nvmf_init_qp(sc, trtype, admin, "admin queue", 0);
	if (sc->admin == NULL) {
		device_printf(sc->dev, "Failed to setup admin queue\n");
		return (ENXIO);
	}

	/* Setup I/O queues. */
	sc->io = malloc(num_io_queues * sizeof(*sc->io), M_NVMF,
	    M_WAITOK | M_ZERO);
	sc->num_io_queues = num_io_queues;
	for (u_int i = 0; i < sc->num_io_queues; i++) {
		snprintf(name, sizeof(name), "I/O queue %u", i);
		sc->io[i] = nvmf_init_qp(sc, trtype, io[i], name, i);
		if (sc->io[i] == NULL) {
			device_printf(sc->dev,
			    "Failed to setup I/O queue %u\n", i);
			return (ENXIO);
		}
	}

	/* Start KeepAlive timers. */
	if (kato != 0) {
		sc->ka_traffic = NVMEV(NVME_CTRLR_DATA_CTRATT_TBKAS,
		    sc->cdata->ctratt) != 0;
		sc->ka_rx_sbt = mstosbt(kato);
		sc->ka_tx_sbt = sc->ka_rx_sbt / 2;
		callout_reset_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0,
		    nvmf_check_keep_alive, sc, C_HARDCLOCK);
		callout_reset_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0,
		    nvmf_send_keep_alive, sc, C_HARDCLOCK);
	}

	memcpy(sc->cdata, nvlist_get_binary(nvl, "cdata", NULL),
	    sizeof(*sc->cdata));

	/* Save reconnect parameters. */
	nvlist_destroy(sc->rparams);
	sc->rparams = nvlist_take_nvlist(nvl, "rparams");

	return (0);
}

typedef bool nvmf_scan_active_ns_cb(struct nvmf_softc *, uint32_t,
    const struct nvme_namespace_data *, void *);

static bool
nvmf_scan_active_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist,
    struct nvme_namespace_data *data, uint32_t *nsidp,
    nvmf_scan_active_ns_cb *cb, void *cb_arg)
{
	struct nvmf_completion_status status;
	uint32_t nsid;

	nvmf_status_init(&status);
	nvmf_status_wait_io(&status);
	if (!nvmf_cmd_identify_active_namespaces(sc, *nsidp, nslist,
	    nvmf_complete, &status, nvmf_io_complete, &status, M_WAITOK)) {
		device_printf(sc->dev,
		    "failed to send IDENTIFY active namespaces command\n");
		return (false);
	}
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev,
		    "IDENTIFY active namespaces failed, status %#x\n",
		    le16toh(status.cqe.status));
		return (false);
	}

	if (status.io_error != 0) {
		device_printf(sc->dev,
		    "IDENTIFY active namespaces failed with I/O error %d\n",
		    status.io_error);
		return (false);
	}

	for (u_int i = 0; i < nitems(nslist->ns); i++) {
		nsid = nslist->ns[i];
		if (nsid == 0) {
			*nsidp = 0;
			return (true);
		}

		nvmf_status_init(&status);
		nvmf_status_wait_io(&status);
		if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
		    &status, nvmf_io_complete, &status, M_WAITOK)) {
			device_printf(sc->dev,
			    "failed to send IDENTIFY namespace %u command\n",
			    nsid);
			return (false);
		}
		nvmf_wait_for_reply(&status);

		if (status.cqe.status != 0) {
			device_printf(sc->dev,
			    "IDENTIFY namespace %u failed, status %#x\n", nsid,
			    le16toh(status.cqe.status));
			return (false);
		}

		if (status.io_error != 0) {
			device_printf(sc->dev,
			    "IDENTIFY namespace %u failed with I/O error %d\n",
			    nsid, status.io_error);
			return (false);
		}

		nvme_namespace_data_swapbytes(data);
		if (!cb(sc, nsid, data, cb_arg))
			return (false);
	}

	MPASS(nsid == nslist->ns[nitems(nslist->ns) - 1] && nsid != 0);

	if (nsid >= NVME_GLOBAL_NAMESPACE_TAG - 1)
		*nsidp = 0;
	else
		*nsidp = nsid;
	return (true);
}

static bool
nvmf_scan_active_namespaces(struct nvmf_softc *sc, nvmf_scan_active_ns_cb *cb,
    void *cb_arg)
{
	struct nvme_namespace_data *data;
	struct nvme_ns_list *nslist;
	uint32_t nsid;
	bool retval;

	nslist = malloc(sizeof(*nslist), M_NVMF, M_WAITOK);
	data = malloc(sizeof(*data), M_NVMF, M_WAITOK);

	nsid = 0;
	retval = true;
	for (;;) {
		if (!nvmf_scan_active_nslist(sc, nslist, data, &nsid, cb,
		    cb_arg)) {
			retval = false;
			break;
		}
		if (nsid == 0)
			break;
	}

	free(data, M_NVMF);
	free(nslist, M_NVMF);
	return (retval);
}
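
/*
 * The active namespace scan above is paged: each request (Identify,
 * CNS 02h) returns up to nitems(nslist->ns) ascending NSIDs greater
 * than the NSID passed in.  A zero entry marks the end of the list;
 * a full page re-seeds the next iteration with its last NSID unless
 * that NSID has reached NVME_GLOBAL_NAMESPACE_TAG - 1.
 */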

static bool
nvmf_add_ns(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data, void *arg __unused)
{
	if (sc->ns[nsid - 1] != NULL) {
		device_printf(sc->dev,
		    "duplicate namespace %u in active namespace list\n",
		    nsid);
		return (false);
	}

	/*
	 * As in nvme_ns_construct, a size of zero indicates an
	 * invalid namespace.
	 */
	if (data->nsze == 0) {
		device_printf(sc->dev,
		    "ignoring active namespace %u with zero size\n", nsid);
		return (true);
	}

	sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);

	nvmf_sim_rescan_ns(sc, nsid);
	return (true);
}

static bool
nvmf_add_namespaces(struct nvmf_softc *sc)
{
	sc->ns = mallocarray(sc->cdata->nn, sizeof(*sc->ns), M_NVMF,
	    M_WAITOK | M_ZERO);
	return (nvmf_scan_active_namespaces(sc, nvmf_add_ns, NULL));
}

static int
nvmf_attach(device_t dev)
{
	struct make_dev_args mda;
	struct nvmf_softc *sc = device_get_softc(dev);
	nvlist_t *nvl = device_get_ivars(dev);
	const nvlist_t * const *io;
	struct sysctl_oid *oid;
	uint64_t mpsmin, val;
	u_int i;
	int error;

	if (nvl == NULL)
		return (ENXIO);

	sc->dev = dev;
	sc->trtype = nvlist_get_number(nvl, "trtype");
	callout_init(&sc->ka_rx_timer, 1);
	callout_init(&sc->ka_tx_timer, 1);
	sx_init(&sc->connection_lock, "nvmf connection");
	TASK_INIT(&sc->disconnect_task, 0, nvmf_disconnect_task, sc);
	TIMEOUT_TASK_INIT(nvmf_tq, &sc->controller_loss_task, 0,
	    nvmf_controller_loss_task, sc);
	TIMEOUT_TASK_INIT(nvmf_tq, &sc->request_reconnect_task, 0,
	    nvmf_request_reconnect_task, sc);

	oid = SYSCTL_ADD_NODE(device_get_sysctl_ctx(dev),
	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "ioq",
	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "I/O Queues");
	sc->ioq_oid_list = SYSCTL_CHILDREN(oid);

	sc->cdata = malloc(sizeof(*sc->cdata), M_NVMF, M_WAITOK);

	nvmf_init_aer(sc);

	error = nvmf_establish_connection(sc, nvl);
	if (error != 0)
		goto out;

	error = nvmf_read_property(sc, NVMF_PROP_CAP, 8, &sc->cap);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch CAP\n");
		error = ENXIO;
		goto out;
	}

	error = nvmf_read_property(sc, NVMF_PROP_VS, 4, &val);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch VS\n");
		error = ENXIO;
		goto out;
	}
	sc->vs = val;

	/* Honor MDTS if it is set. */
	mpsmin = (uint64_t)1 << (NVME_MPS_SHIFT +
	    NVME_CAP_HI_MPSMIN(sc->cap >> 32));
	sc->max_xfer_size = maxphys;
	if (sc->cdata->mdts != 0) {
		sc->max_xfer_size = ulmin(sc->max_xfer_size,
		    mpsmin << sc->cdata->mdts);
	}
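
	/*
	 * Worked example: with MPSMIN 0 (4 KiB pages, NVME_MPS_SHIFT
	 * being 12) and MDTS 5, the clamp above is 4 KiB << 5 =
	 * 128 KiB, further limited by maxphys.
	 */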

	/* Honor any transfer size restriction imposed by the transport. */
	val = nvmf_max_xfer_size_qp(sc->io[0]);
	if (val >= mpsmin)
		sc->max_xfer_size = ulmin(sc->max_xfer_size,
		    rounddown2(val, mpsmin));

	io = nvlist_get_nvlist_array(nvl, "io", NULL);
	sc->max_pending_io = nvlist_get_number(io[0], "qsize") *
	    sc->num_io_queues;

	error = nvmf_init_sim(sc);
	if (error != 0)
		goto out;

	error = nvmf_start_aer(sc);
	if (error != 0) {
		nvmf_destroy_sim(sc);
		goto out;
	}

	if (!nvmf_add_namespaces(sc)) {
		nvmf_destroy_sim(sc);
		goto out;
	}

	make_dev_args_init(&mda);
	mda.mda_devsw = &nvmf_cdevsw;
	mda.mda_uid = UID_ROOT;
	mda.mda_gid = GID_WHEEL;
	mda.mda_mode = 0600;
	mda.mda_si_drv1 = sc;
	error = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev));
	if (error != 0) {
		nvmf_destroy_sim(sc);
		goto out;
	}

	sc->shutdown_pre_sync_eh = EVENTHANDLER_REGISTER(shutdown_pre_sync,
	    nvmf_shutdown_pre_sync, sc, SHUTDOWN_PRI_FIRST);
	sc->shutdown_post_sync_eh = EVENTHANDLER_REGISTER(shutdown_post_sync,
	    nvmf_shutdown_post_sync, sc, SHUTDOWN_PRI_LAST);

	return (0);
out:
	if (sc->ns != NULL) {
		for (i = 0; i < sc->cdata->nn; i++) {
			if (sc->ns[i] != NULL)
				nvmf_destroy_ns(sc->ns[i]);
		}
		free(sc->ns, M_NVMF);
	}

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);

	if (sc->admin != NULL)
		nvmf_shutdown_controller(sc);

	for (i = 0; i < sc->num_io_queues; i++) {
		if (sc->io[i] != NULL)
			nvmf_destroy_qp(sc->io[i]);
	}
	free(sc->io, M_NVMF);
	if (sc->admin != NULL)
		nvmf_destroy_qp(sc->admin);

	nvmf_destroy_aer(sc);

	taskqueue_drain_timeout(nvmf_tq, &sc->request_reconnect_task);
	taskqueue_drain_timeout(nvmf_tq, &sc->controller_loss_task);
	taskqueue_drain(nvmf_tq, &sc->disconnect_task);
	sx_destroy(&sc->connection_lock);
	nvlist_destroy(sc->rparams);
	free(sc->cdata, M_NVMF);
	return (error);
}

void
nvmf_disconnect(struct nvmf_softc *sc)
{
	taskqueue_enqueue(nvmf_tq, &sc->disconnect_task);
}
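
/*
 * Transports call nvmf_disconnect() when an association fails.  The
 * actual teardown runs from the nvmf taskqueue so that it can sleep
 * and so that repeated errors collapse into a single queued task.
 */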

static void
nvmf_disconnect_task(void *arg, int pending __unused)
{
	struct nvmf_softc *sc = arg;
	u_int i;

	sx_xlock(&sc->connection_lock);
	if (sc->admin == NULL) {
		/*
		 * Ignore transport errors if there is no active
		 * association.
		 */
		sx_xunlock(&sc->connection_lock);
		return;
	}

	if (sc->detaching) {
		if (sc->admin != NULL) {
			/*
			 * This unsticks the detach process if a
			 * transport error occurs during detach.
			 */
			nvmf_shutdown_qp(sc->admin);
		}
		sx_xunlock(&sc->connection_lock);
		return;
	}

	if (sc->cdev == NULL) {
		/*
		 * A transport error occurred during attach
		 * (nvmf_add_namespaces).  Shutdown the admin queue.
		 */
		nvmf_shutdown_qp(sc->admin);
		sx_xunlock(&sc->connection_lock);
		return;
	}

	nanotime(&sc->last_disconnect);
	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);
	sc->ka_traffic = false;

	/* Quiesce namespace consumers. */
	nvmf_disconnect_sim(sc);
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_disconnect_ns(sc->ns[i]);
	}

	/* Shutdown the existing qpairs. */
	for (i = 0; i < sc->num_io_queues; i++) {
		nvmf_destroy_qp(sc->io[i]);
	}
	free(sc->io, M_NVMF);
	sc->io = NULL;
	sc->num_io_queues = 0;
	nvmf_destroy_qp(sc->admin);
	sc->admin = NULL;

	if (sc->reconnect_delay != 0)
		nvmf_request_reconnect(sc);
	if (sc->controller_loss_timeout != 0)
		taskqueue_enqueue_timeout(nvmf_tq,
		    &sc->controller_loss_task, sc->controller_loss_timeout *
		    hz);

	sx_xunlock(&sc->connection_lock);
}

static void
nvmf_controller_loss_task(void *arg, int pending)
{
	struct nvmf_softc *sc = arg;
	device_t dev;
	int error;

	bus_topo_lock();
	sx_xlock(&sc->connection_lock);
	if (sc->admin != NULL || sc->detaching) {
		/* Reconnected or already detaching. */
		sx_xunlock(&sc->connection_lock);
		bus_topo_unlock();
		return;
	}

	sc->controller_timedout = true;
	sx_xunlock(&sc->connection_lock);

	/*
	 * XXX: Doing this from here is a bit ugly.  We don't have an
	 * extra reference on `dev` but bus_topo_lock should block any
	 * concurrent device_delete_child invocations.
	 */
	dev = sc->dev;
	error = device_delete_child(root_bus, dev);
	if (error != 0)
		device_printf(dev,
		    "failed to detach after controller loss: %d\n", error);
	bus_topo_unlock();
}

static void
nvmf_request_reconnect(struct nvmf_softc *sc)
{
	char buf[64];

	sx_assert(&sc->connection_lock, SX_LOCKED);

	snprintf(buf, sizeof(buf), "name=\"%s\"",
	    device_get_nameunit(sc->dev));
	devctl_notify("nvme", "controller", "RECONNECT", buf);
	taskqueue_enqueue_timeout(nvmf_tq, &sc->request_reconnect_task,
	    sc->reconnect_delay * hz);
}

static void
nvmf_request_reconnect_task(void *arg, int pending)
{
	struct nvmf_softc *sc = arg;

	sx_xlock(&sc->connection_lock);
	if (sc->admin != NULL || sc->detaching || sc->controller_timedout) {
		/*
		 * Reconnected, already detaching, or the controller
		 * loss timeout has fired.
		 */
		sx_xunlock(&sc->connection_lock);
		return;
	}

	nvmf_request_reconnect(sc);
	sx_xunlock(&sc->connection_lock);
}
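
/*
 * After a disconnect, two timers race: the reconnect task re-posts a
 * RECONNECT devctl(4) notification every reconnect_delay seconds so
 * that userland can hand off a fresh association via
 * NVMF_RECONNECT_HOST, while the controller loss task deletes the
 * device outright once controller_loss_timeout seconds pass without
 * a successful reconnect.
 */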

static int
nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
{
	const struct nvme_controller_data *cdata;
	nvlist_t *nvl;
	u_int i;
	int error;

	error = nvmf_copyin_handoff(nv, &nvl);
	if (error != 0)
		return (error);

	/* XXX: Should we permit changing the transport type? */
	if (sc->trtype != nvlist_get_number(nvl, "trtype")) {
		device_printf(sc->dev,
		    "transport type mismatch on reconnect\n");
		nvlist_destroy(nvl);
		return (EINVAL);
	}

	sx_xlock(&sc->connection_lock);
	if (sc->admin != NULL || sc->detaching || sc->controller_timedout) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Ensure this is for the same controller.  Note that the
	 * controller ID can vary across associations if the remote
	 * system is using the dynamic controller model.  This merely
	 * ensures the new association is connected to the same NVMe
	 * subsystem.
	 */
	cdata = nvlist_get_binary(nvl, "cdata", NULL);
	if (memcmp(sc->cdata->subnqn, cdata->subnqn,
	    sizeof(cdata->subnqn)) != 0) {
		device_printf(sc->dev,
		    "controller subsystem NQN mismatch on reconnect\n");
		error = EINVAL;
		goto out;
	}

	/*
	 * XXX: Require same number and size of I/O queues so that
	 * max_pending_io is still correct?
	 */

	error = nvmf_establish_connection(sc, nvl);
	if (error != 0)
		goto out;

	error = nvmf_start_aer(sc);
	if (error != 0)
		goto out;

	device_printf(sc->dev,
	    "established new association with %u I/O queues\n",
	    sc->num_io_queues);

	/* Restart namespace consumers. */
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_reconnect_ns(sc->ns[i]);
	}
	nvmf_reconnect_sim(sc);

	nvmf_rescan_all_ns(sc);

	taskqueue_cancel_timeout(nvmf_tq, &sc->request_reconnect_task, NULL);
	taskqueue_cancel_timeout(nvmf_tq, &sc->controller_loss_task, NULL);
out:
	sx_xunlock(&sc->connection_lock);
	nvlist_destroy(nvl);
	return (error);
}

static void
nvmf_shutdown_pre_sync(void *arg, int howto)
{
	struct nvmf_softc *sc = arg;

	if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED())
		return;

	/*
	 * If this association is disconnected, abort any pending
	 * requests with an error to permit filesystems to unmount
	 * without hanging.
	 */
	sx_xlock(&sc->connection_lock);
	if (sc->admin != NULL || sc->detaching) {
		sx_xunlock(&sc->connection_lock);
		return;
	}

	for (u_int i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_shutdown_ns(sc->ns[i]);
	}
	nvmf_shutdown_sim(sc);
	sx_xunlock(&sc->connection_lock);
}
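
/*
 * The pre-sync handler runs at SHUTDOWN_PRI_FIRST so that a dead
 * association fails outstanding I/O before filesystems sync; the
 * post-sync handler runs at SHUTDOWN_PRI_LAST so that a live
 * association is torn down only after consumers such as nda(4) have
 * flushed.
 */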

static void
nvmf_shutdown_post_sync(void *arg, int howto)
{
	struct nvmf_softc *sc = arg;

	if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED())
		return;

	/*
	 * If this association is connected, disconnect gracefully.
	 */
	sx_xlock(&sc->connection_lock);
	if (sc->admin == NULL || sc->detaching) {
		sx_xunlock(&sc->connection_lock);
		return;
	}

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);

	nvmf_shutdown_controller(sc);

	/*
	 * Quiesce consumers so that any commands submitted after this
	 * fail with an error.  Notably, nda(4) calls nda_flush() from
	 * a post_sync handler that might be ordered after this one.
	 */
	for (u_int i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_shutdown_ns(sc->ns[i]);
	}
	nvmf_shutdown_sim(sc);

	for (u_int i = 0; i < sc->num_io_queues; i++) {
		nvmf_destroy_qp(sc->io[i]);
	}
	nvmf_destroy_qp(sc->admin);
	sc->admin = NULL;
	sx_xunlock(&sc->connection_lock);
}

static int
nvmf_detach(device_t dev)
{
	struct nvmf_softc *sc = device_get_softc(dev);
	u_int i;

	destroy_dev(sc->cdev);

	sx_xlock(&sc->connection_lock);
	sc->detaching = true;
	sx_xunlock(&sc->connection_lock);

	EVENTHANDLER_DEREGISTER(shutdown_pre_sync, sc->shutdown_pre_sync_eh);
	EVENTHANDLER_DEREGISTER(shutdown_post_sync, sc->shutdown_post_sync_eh);

	nvmf_destroy_sim(sc);
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_destroy_ns(sc->ns[i]);
	}
	free(sc->ns, M_NVMF);

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);

	if (sc->admin != NULL)
		nvmf_shutdown_controller(sc);

	for (i = 0; i < sc->num_io_queues; i++) {
		nvmf_destroy_qp(sc->io[i]);
	}
	free(sc->io, M_NVMF);

	taskqueue_drain(nvmf_tq, &sc->disconnect_task);
	if (taskqueue_cancel_timeout(nvmf_tq, &sc->request_reconnect_task,
	    NULL) != 0)
		taskqueue_drain_timeout(nvmf_tq, &sc->request_reconnect_task);

	/*
	 * Don't cancel/drain the controller loss task if that task
	 * has fired and is triggering the detach.
	 */
	if (!sc->controller_timedout) {
		if (taskqueue_cancel_timeout(nvmf_tq,
		    &sc->controller_loss_task, NULL) != 0)
			taskqueue_drain_timeout(nvmf_tq,
			    &sc->controller_loss_task);
	}

	if (sc->admin != NULL)
		nvmf_destroy_qp(sc->admin);

	nvmf_destroy_aer(sc);

	sx_destroy(&sc->connection_lock);
	nvlist_destroy(sc->rparams);
	free(sc->cdata, M_NVMF);
	return (0);
}

static void
nvmf_rescan_ns_1(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data)
{
	struct nvmf_namespace *ns;

	/* XXX: Needs locking around sc->ns[]. */
	ns = sc->ns[nsid - 1];
	if (data->nsze == 0) {
		/* XXX: Needs locking */
		if (ns != NULL) {
			nvmf_destroy_ns(ns);
			sc->ns[nsid - 1] = NULL;
		}
	} else {
		/* XXX: Needs locking */
		if (ns == NULL) {
			sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
		} else {
			if (!nvmf_update_ns(ns, data)) {
				nvmf_destroy_ns(ns);
				sc->ns[nsid - 1] = NULL;
			}
		}
	}

	nvmf_sim_rescan_ns(sc, nsid);
}

void
nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid)
{
	struct nvmf_completion_status status;
	struct nvme_namespace_data *data;

	data = malloc(sizeof(*data), M_NVMF, M_WAITOK);

	nvmf_status_init(&status);
	nvmf_status_wait_io(&status);
	if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
	    &status, nvmf_io_complete, &status, M_WAITOK)) {
		device_printf(sc->dev,
		    "failed to send IDENTIFY namespace %u command\n", nsid);
		free(data, M_NVMF);
		return;
	}
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev,
		    "IDENTIFY namespace %u failed, status %#x\n", nsid,
		    le16toh(status.cqe.status));
		free(data, M_NVMF);
		return;
	}

	if (status.io_error != 0) {
		device_printf(sc->dev,
		    "IDENTIFY namespace %u failed with I/O error %d\n",
		    nsid, status.io_error);
		free(data, M_NVMF);
		return;
	}

	nvme_namespace_data_swapbytes(data);

	nvmf_rescan_ns_1(sc, nsid, data);

	free(data, M_NVMF);
}

static void
nvmf_purge_namespaces(struct nvmf_softc *sc, uint32_t first_nsid,
    uint32_t next_valid_nsid)
{
	struct nvmf_namespace *ns;

	for (uint32_t nsid = first_nsid; nsid < next_valid_nsid; nsid++) {
		/* XXX: Needs locking around sc->ns[]. */
		ns = sc->ns[nsid - 1];
		if (ns != NULL) {
			nvmf_destroy_ns(ns);
			sc->ns[nsid - 1] = NULL;

			nvmf_sim_rescan_ns(sc, nsid);
		}
	}
}

static bool
nvmf_rescan_ns_cb(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data, void *arg)
{
	uint32_t *last_nsid = arg;

	/* Check for any gaps prior to this namespace. */
	nvmf_purge_namespaces(sc, *last_nsid + 1, nsid);
	*last_nsid = nsid;

	nvmf_rescan_ns_1(sc, nsid, data);
	return (true);
}

void
nvmf_rescan_all_ns(struct nvmf_softc *sc)
{
	uint32_t last_nsid;

	last_nsid = 0;
	if (!nvmf_scan_active_namespaces(sc, nvmf_rescan_ns_cb, &last_nsid))
		return;

	/*
	 * Check for any namespace devices after the last active
	 * namespace.
	 */
	nvmf_purge_namespaces(sc, last_nsid + 1, sc->cdata->nn + 1);
}
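
/*
 * A full rescan walks the (ascending) active namespace list once:
 * each callback purges any stale devices in the NSID gap before it,
 * then refreshes its own namespace, and a final purge sweeps from the
 * last active NSID through cdata->nn.
 */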

int
nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt,
    bool admin)
{
	struct nvmf_completion_status status;
	struct nvme_command cmd;
	struct memdesc mem;
	struct nvmf_host_qpair *qp;
	struct nvmf_request *req;
	void *buf;
	int error;

	if (pt->len > sc->max_xfer_size)
		return (EINVAL);

	buf = NULL;
	if (pt->len != 0) {
		/*
		 * XXX: Depending on the size we may want to pin the
		 * user pages and use a memdesc with vm_page_t's
		 * instead.
		 */
		buf = malloc(pt->len, M_NVMF, M_WAITOK);
		if (pt->is_read == 0) {
			error = copyin(pt->buf, buf, pt->len);
			if (error != 0) {
				free(buf, M_NVMF);
				return (error);
			}
		} else {
			/* Ensure no kernel data is leaked to userland. */
			memset(buf, 0, pt->len);
		}
	}

	memset(&cmd, 0, sizeof(cmd));
	cmd.opc = pt->cmd.opc;
	cmd.fuse = pt->cmd.fuse;
	cmd.nsid = pt->cmd.nsid;
	cmd.cdw10 = pt->cmd.cdw10;
	cmd.cdw11 = pt->cmd.cdw11;
	cmd.cdw12 = pt->cmd.cdw12;
	cmd.cdw13 = pt->cmd.cdw13;
	cmd.cdw14 = pt->cmd.cdw14;
	cmd.cdw15 = pt->cmd.cdw15;

	sx_slock(&sc->connection_lock);
	if (sc->admin == NULL || sc->detaching) {
		device_printf(sc->dev,
		    "failed to send passthrough command\n");
		error = ECONNABORTED;
		sx_sunlock(&sc->connection_lock);
		goto error;
	}
	if (admin)
		qp = sc->admin;
	else
		qp = nvmf_select_io_queue(sc);
	nvmf_status_init(&status);
	req = nvmf_allocate_request(qp, &cmd, nvmf_complete, &status,
	    M_WAITOK);
	sx_sunlock(&sc->connection_lock);
	if (req == NULL) {
		device_printf(sc->dev, "failed to send passthrough command\n");
		error = ECONNABORTED;
		goto error;
	}

	if (pt->len != 0) {
		mem = memdesc_vaddr(buf, pt->len);
		nvmf_capsule_append_data(req->nc, &mem, pt->len,
		    pt->is_read == 0, nvmf_io_complete, &status);
		nvmf_status_wait_io(&status);
	}

	nvmf_submit_request(req);
	nvmf_wait_for_reply(&status);

	memset(&pt->cpl, 0, sizeof(pt->cpl));
	pt->cpl.cdw0 = status.cqe.cdw0;
	pt->cpl.status = status.cqe.status;

	error = status.io_error;
	if (error == 0 && pt->len != 0 && pt->is_read != 0)
		error = copyout(buf, pt->buf, pt->len);
error:
	free(buf, M_NVMF);
	return (error);
}

static int
nvmf_reconnect_params(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
{
	int error;

	sx_slock(&sc->connection_lock);
	error = nvmf_pack_ioc_nvlist(sc->rparams, nv);
	sx_sunlock(&sc->connection_lock);

	return (error);
}

static int
nvmf_connection_status(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
{
	nvlist_t *nvl, *nvl_ts;
	int error;

	nvl = nvlist_create(0);
	nvl_ts = nvlist_create(0);

	sx_slock(&sc->connection_lock);
	nvlist_add_bool(nvl, "connected", sc->admin != NULL);
	nvlist_add_number(nvl_ts, "tv_sec", sc->last_disconnect.tv_sec);
	nvlist_add_number(nvl_ts, "tv_nsec", sc->last_disconnect.tv_nsec);
	sx_sunlock(&sc->connection_lock);
	nvlist_move_nvlist(nvl, "last_disconnect", nvl_ts);

	error = nvmf_pack_ioc_nvlist(nvl, nv);
	nvlist_destroy(nvl);
	return (error);
}
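
/*
 * NVMF_CONNECTION_STATUS returns an nvlist of the form:
 *
 *	"connected"		bool
 *	"last_disconnect"	nvlist: "tv_sec"/"tv_nsec" numbers
 *
 * which consumers can decode back into a struct timespec.
 */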

static int
nvmf_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
    struct thread *td)
{
	struct nvmf_softc *sc = cdev->si_drv1;
	struct nvme_get_nsid *gnsid;
	struct nvme_pt_command *pt;
	struct nvmf_ioc_nv *nv;

	switch (cmd) {
	case NVME_PASSTHROUGH_CMD:
		pt = (struct nvme_pt_command *)arg;
		return (nvmf_passthrough_cmd(sc, pt, true));
	case NVME_GET_NSID:
		gnsid = (struct nvme_get_nsid *)arg;
		strlcpy(gnsid->cdev, device_get_nameunit(sc->dev),
		    sizeof(gnsid->cdev));
		gnsid->nsid = 0;
		return (0);
	case NVME_GET_MAX_XFER_SIZE:
		*(uint64_t *)arg = sc->max_xfer_size;
		return (0);
	case NVME_GET_CONTROLLER_DATA:
		memcpy(arg, sc->cdata, sizeof(*sc->cdata));
		return (0);
	case DIOCGIDENT:
		nvme_cdata_get_disk_ident(sc->cdata, (uint8_t *)arg);
		return (0);
	case NVMF_RECONNECT_PARAMS:
		nv = (struct nvmf_ioc_nv *)arg;
		return (nvmf_reconnect_params(sc, nv));
	case NVMF_RECONNECT_HOST:
		nv = (struct nvmf_ioc_nv *)arg;
		return (nvmf_reconnect_host(sc, nv));
	case NVMF_CONNECTION_STATUS:
		nv = (struct nvmf_ioc_nv *)arg;
		return (nvmf_connection_status(sc, nv));
	default:
		return (ENOTTY);
	}
}

static struct cdevsw nvmf_cdevsw = {
	.d_version = D_VERSION,
	.d_ioctl = nvmf_ioctl
};

static int
nvmf_modevent(module_t mod, int what, void *arg)
{
	int error;

	switch (what) {
	case MOD_LOAD:
		error = nvmf_ctl_load();
		if (error != 0)
			return (error);

		nvmf_tq = taskqueue_create("nvmf", M_WAITOK | M_ZERO,
		    taskqueue_thread_enqueue, &nvmf_tq);
		taskqueue_start_threads(&nvmf_tq, 1, PWAIT, "nvmf taskq");
		return (0);
	case MOD_QUIESCE:
		return (0);
	case MOD_UNLOAD:
		nvmf_ctl_unload();
		destroy_dev_drain(&nvmf_cdevsw);
		if (nvmf_tq != NULL)
			taskqueue_free(nvmf_tq);
		return (0);
	default:
		return (EOPNOTSUPP);
	}
}

static device_method_t nvmf_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe, nvmf_probe),
	DEVMETHOD(device_attach, nvmf_attach),
	DEVMETHOD(device_detach, nvmf_detach),
	DEVMETHOD_END
};

driver_t nvme_nvmf_driver = {
	"nvme",
	nvmf_methods,
	sizeof(struct nvmf_softc),
};

DRIVER_MODULE(nvme, root, nvme_nvmf_driver, nvmf_modevent, NULL);
MODULE_DEPEND(nvmf, nvmf_transport, 1, 1, 1);