/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2023-2024 Chelsio Communications, Inc.
 * Written by: John Baldwin <jhb@FreeBSD.org>
 */

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/dnv.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/memdesc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/nv.h>
#include <sys/reboot.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/nvmf.h>
#include <dev/nvmf/nvmf_transport.h>
#include <dev/nvmf/host/nvmf_var.h>

static struct cdevsw nvmf_cdevsw;

bool nvmf_fail_disconnect = false;
SYSCTL_BOOL(_kern_nvmf, OID_AUTO, fail_on_disconnection, CTLFLAG_RWTUN,
    &nvmf_fail_disconnect, 0, "Fail I/O requests on connection failure");

MALLOC_DEFINE(M_NVMF, "nvmf", "NVMe over Fabrics host");

static void nvmf_disconnect_task(void *arg, int pending);
static void nvmf_shutdown_pre_sync(void *arg, int howto);
static void nvmf_shutdown_post_sync(void *arg, int howto);

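/*
 * Synchronous command helpers.  nvmf_complete() and nvmf_io_complete()
 * record the command completion and the data-transfer result,
 * respectively, in a caller-provided nvmf_completion_status, using a
 * pool mutex keyed on the status address to synchronize with
 * nvmf_wait_for_reply(), which sleeps until both halves have finished.
 */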
void
nvmf_complete(void *arg, const struct nvme_completion *cqe)
{
	struct nvmf_completion_status *status = arg;
	struct mtx *mtx;

	status->cqe = *cqe;
	mtx = mtx_pool_find(mtxpool_sleep, status);
	mtx_lock(mtx);
	status->done = true;
	mtx_unlock(mtx);
	wakeup(status);
}

void
nvmf_io_complete(void *arg, size_t xfered, int error)
{
	struct nvmf_completion_status *status = arg;
	struct mtx *mtx;

	status->io_error = error;
	mtx = mtx_pool_find(mtxpool_sleep, status);
	mtx_lock(mtx);
	status->io_done = true;
	mtx_unlock(mtx);
	wakeup(status);
}

void
nvmf_wait_for_reply(struct nvmf_completion_status *status)
{
	struct mtx *mtx;

	mtx = mtx_pool_find(mtxpool_sleep, status);
	mtx_lock(mtx);
	while (!status->done || !status->io_done)
		mtx_sleep(status, mtx, 0, "nvmfcmd", 0);
	mtx_unlock(mtx);
}

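/* Read a controller property via a Fabrics Property Get command. */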
static int
nvmf_read_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
    uint64_t *value)
{
	const struct nvmf_fabric_prop_get_rsp *rsp;
	struct nvmf_completion_status status;

	nvmf_status_init(&status);
	if (!nvmf_cmd_get_property(sc, offset, size, nvmf_complete, &status,
	    M_WAITOK))
		return (ECONNABORTED);
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev, "PROPERTY_GET failed, status %#x\n",
		    le16toh(status.cqe.status));
		return (EIO);
	}

	rsp = (const struct nvmf_fabric_prop_get_rsp *)&status.cqe;
	if (size == 8)
		*value = le64toh(rsp->value.u64);
	else
		*value = le32toh(rsp->value.u32.low);
	return (0);
}

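/* Write a controller property via a Fabrics Property Set command. */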
static int
nvmf_write_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
    uint64_t value)
{
	struct nvmf_completion_status status;

	nvmf_status_init(&status);
	if (!nvmf_cmd_set_property(sc, offset, size, value, nvmf_complete,
	    &status, M_WAITOK))
		return (ECONNABORTED);
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev, "PROPERTY_SET failed, status %#x\n",
		    le16toh(status.cqe.status));
		return (EIO);
	}
	return (0);
}

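/*
 * Request a normal controller shutdown by reading CC, setting CC.SHN to
 * "normal", and writing the register back.
 */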
static void
nvmf_shutdown_controller(struct nvmf_softc *sc)
{
	uint64_t cc;
	int error;

	error = nvmf_read_property(sc, NVMF_PROP_CC, 4, &cc);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch CC for shutdown\n");
		return;
	}

	cc |= NVMEF(NVME_CC_REG_SHN, NVME_SHN_NORMAL);

	error = nvmf_write_property(sc, NVMF_PROP_CC, 4, cc);
	if (error != 0)
		device_printf(sc->dev,
		    "Failed to set CC to trigger shutdown\n");
}

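/*
 * KeepAlive receive timer: if no traffic has arrived from the controller
 * during the previous interval, treat the association as dead and
 * schedule a disconnect; otherwise re-arm the timer.
 */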
static void
nvmf_check_keep_alive(void *arg)
{
	struct nvmf_softc *sc = arg;
	int traffic;

	traffic = atomic_readandclear_int(&sc->ka_active_rx_traffic);
	if (traffic == 0) {
		device_printf(sc->dev,
		    "disconnecting due to KeepAlive timeout\n");
		nvmf_disconnect(sc);
		return;
	}

	callout_schedule_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0, C_HARDCLOCK);
}

static void
nvmf_keep_alive_complete(void *arg, const struct nvme_completion *cqe)
{
	struct nvmf_softc *sc = arg;

	atomic_store_int(&sc->ka_active_rx_traffic, 1);
	if (cqe->status != 0) {
		device_printf(sc->dev,
		    "KeepAlive response reported status %#x\n",
		    le16toh(cqe->status));
	}
}

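/*
 * KeepAlive transmit timer: periodically send a KeepAlive command unless
 * other transmitted commands have already served as an implicit
 * keep-alive during this interval (TKAS).
 */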
static void
nvmf_send_keep_alive(void *arg)
{
	struct nvmf_softc *sc = arg;
	int traffic;

	/*
	 * Don't bother sending a KeepAlive command if TKAS is active
	 * and another command has been sent during the interval.
	 */
	traffic = atomic_load_int(&sc->ka_active_tx_traffic);
	if (traffic == 0 && !nvmf_cmd_keep_alive(sc, nvmf_keep_alive_complete,
	    sc, M_NOWAIT))
		device_printf(sc->dev,
		    "Failed to allocate KeepAlive command\n");

	/* Clear ka_active_tx_traffic after sending the keep alive command. */
	atomic_store_int(&sc->ka_active_tx_traffic, 0);

	callout_schedule_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, C_HARDCLOCK);
}

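/*
 * Copy in and validate the handoff nvlist passed down from userland.
 * It must describe the transport, an admin queue, at least one I/O queue
 * (all of the same size), the controller data, and the reconnect
 * parameters; on success the unpacked nvlist is returned to the caller.
 */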
int
nvmf_copyin_handoff(const struct nvmf_ioc_nv *nv, nvlist_t **nvlp)
{
	const struct nvme_discovery_log_entry *dle;
	const struct nvme_controller_data *cdata;
	const nvlist_t *const *io;
	const nvlist_t *admin, *rparams;
	nvlist_t *nvl;
	size_t i, num_io_queues;
	uint32_t qsize;
	int error;

	error = nvmf_unpack_ioc_nvlist(nv, &nvl);
	if (error != 0)
		return (error);

	if (!nvlist_exists_number(nvl, "trtype") ||
	    !nvlist_exists_nvlist(nvl, "admin") ||
	    !nvlist_exists_nvlist_array(nvl, "io") ||
	    !nvlist_exists_binary(nvl, "cdata") ||
	    !nvlist_exists_nvlist(nvl, "rparams"))
		goto invalid;

	rparams = nvlist_get_nvlist(nvl, "rparams");
	if (!nvlist_exists_binary(rparams, "dle") ||
	    !nvlist_exists_string(rparams, "hostnqn") ||
	    !nvlist_exists_number(rparams, "num_io_queues") ||
	    !nvlist_exists_number(rparams, "io_qsize"))
		goto invalid;

	admin = nvlist_get_nvlist(nvl, "admin");
	if (!nvmf_validate_qpair_nvlist(admin, false))
		goto invalid;
	if (!nvlist_get_bool(admin, "admin"))
		goto invalid;

	io = nvlist_get_nvlist_array(nvl, "io", &num_io_queues);
	if (num_io_queues < 1 ||
	    num_io_queues != nvlist_get_number(rparams, "num_io_queues"))
		goto invalid;
	for (i = 0; i < num_io_queues; i++) {
		if (!nvmf_validate_qpair_nvlist(io[i], false))
			goto invalid;
	}

	/* Require all I/O queues to be the same size. */
	qsize = nvlist_get_number(rparams, "io_qsize");
	for (i = 0; i < num_io_queues; i++) {
		if (nvlist_get_number(io[i], "qsize") != qsize)
			goto invalid;
	}

	cdata = nvlist_get_binary(nvl, "cdata", &i);
	if (i != sizeof(*cdata))
		goto invalid;
	dle = nvlist_get_binary(rparams, "dle", &i);
	if (i != sizeof(*dle))
		goto invalid;

	if (memcmp(dle->subnqn, cdata->subnqn, sizeof(cdata->subnqn)) != 0)
		goto invalid;

	*nvlp = nvl;
	return (0);
invalid:
	nvlist_destroy(nvl);
	return (EINVAL);
}

static int
nvmf_probe(device_t dev)
{
	const nvlist_t *nvl = device_get_ivars(dev);
	const struct nvme_controller_data *cdata;

	if (nvl == NULL)
		return (ENXIO);

	cdata = nvlist_get_binary(nvl, "cdata", NULL);
	device_set_descf(dev, "Fabrics: %.256s", cdata->subnqn);
	return (BUS_PROBE_DEFAULT);
}

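/*
 * Adopt the already-connected queue pairs described in the handoff
 * nvlist: initialize the admin queue and each I/O queue, start the
 * KeepAlive timers if a KATO value was handed off, cache the controller
 * data, and save the reconnect parameters for later reconnect requests.
 */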
static int
nvmf_establish_connection(struct nvmf_softc *sc, nvlist_t *nvl)
{
	const nvlist_t *const *io;
	const nvlist_t *admin;
	uint64_t kato;
	size_t num_io_queues;
	enum nvmf_trtype trtype;
	char name[16];

	trtype = nvlist_get_number(nvl, "trtype");
	admin = nvlist_get_nvlist(nvl, "admin");
	io = nvlist_get_nvlist_array(nvl, "io", &num_io_queues);
	kato = dnvlist_get_number(nvl, "kato", 0);

	/* Setup the admin queue. */
	sc->admin = nvmf_init_qp(sc, trtype, admin, "admin queue", 0);
	if (sc->admin == NULL) {
		device_printf(sc->dev, "Failed to setup admin queue\n");
		return (ENXIO);
	}

	/* Setup I/O queues. */
	sc->io = malloc(num_io_queues * sizeof(*sc->io), M_NVMF,
	    M_WAITOK | M_ZERO);
	sc->num_io_queues = num_io_queues;
	for (u_int i = 0; i < sc->num_io_queues; i++) {
		snprintf(name, sizeof(name), "I/O queue %u", i);
		sc->io[i] = nvmf_init_qp(sc, trtype, io[i], name, i);
		if (sc->io[i] == NULL) {
			device_printf(sc->dev, "Failed to setup I/O queue %u\n",
			    i);
			return (ENXIO);
		}
	}

	/* Start KeepAlive timers. */
	if (kato != 0) {
		sc->ka_traffic = NVMEV(NVME_CTRLR_DATA_CTRATT_TBKAS,
		    sc->cdata->ctratt) != 0;
		sc->ka_rx_sbt = mstosbt(kato);
		sc->ka_tx_sbt = sc->ka_rx_sbt / 2;
		callout_reset_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0,
		    nvmf_check_keep_alive, sc, C_HARDCLOCK);
		callout_reset_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0,
		    nvmf_send_keep_alive, sc, C_HARDCLOCK);
	}

	memcpy(sc->cdata, nvlist_get_binary(nvl, "cdata", NULL),
	    sizeof(*sc->cdata));

	/* Save reconnect parameters. */
	nvlist_destroy(sc->rparams);
	sc->rparams = nvlist_take_nvlist(nvl, "rparams");

	return (0);
}

typedef bool nvmf_scan_active_ns_cb(struct nvmf_softc *, uint32_t,
    const struct nvme_namespace_data *, void *);

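/*
 * Fetch the next batch of the active namespace list following *nsidp and
 * invoke the callback for each namespace it contains, passing the
 * byte-swapped IDENTIFY namespace data.  On return *nsidp is either 0
 * (the scan is complete) or the NSID to continue the scan after.
 */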
static bool
nvmf_scan_active_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist,
    struct nvme_namespace_data *data, uint32_t *nsidp,
    nvmf_scan_active_ns_cb *cb, void *cb_arg)
{
	struct nvmf_completion_status status;
	uint32_t nsid;

	nvmf_status_init(&status);
	nvmf_status_wait_io(&status);
	if (!nvmf_cmd_identify_active_namespaces(sc, *nsidp, nslist,
	    nvmf_complete, &status, nvmf_io_complete, &status, M_WAITOK)) {
		device_printf(sc->dev,
		    "failed to send IDENTIFY active namespaces command\n");
		return (false);
	}
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev,
		    "IDENTIFY active namespaces failed, status %#x\n",
		    le16toh(status.cqe.status));
		return (false);
	}

	if (status.io_error != 0) {
		device_printf(sc->dev,
		    "IDENTIFY active namespaces failed with I/O error %d\n",
		    status.io_error);
		return (false);
	}

	for (u_int i = 0; i < nitems(nslist->ns); i++) {
		nsid = nslist->ns[i];
		if (nsid == 0) {
			*nsidp = 0;
			return (true);
		}

		nvmf_status_init(&status);
		nvmf_status_wait_io(&status);
		if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
		    &status, nvmf_io_complete, &status, M_WAITOK)) {
			device_printf(sc->dev,
			    "failed to send IDENTIFY namespace %u command\n",
			    nsid);
			return (false);
		}
		nvmf_wait_for_reply(&status);

		if (status.cqe.status != 0) {
			device_printf(sc->dev,
			    "IDENTIFY namespace %u failed, status %#x\n", nsid,
			    le16toh(status.cqe.status));
			return (false);
		}

		if (status.io_error != 0) {
			device_printf(sc->dev,
			    "IDENTIFY namespace %u failed with I/O error %d\n",
			    nsid, status.io_error);
			return (false);
		}

		nvme_namespace_data_swapbytes(data);
		if (!cb(sc, nsid, data, cb_arg))
			return (false);
	}

	MPASS(nsid == nslist->ns[nitems(nslist->ns) - 1] && nsid != 0);

	if (nsid >= NVME_GLOBAL_NAMESPACE_TAG - 1)
		*nsidp = 0;
	else
		*nsidp = nsid;
	return (true);
}

static bool
nvmf_scan_active_namespaces(struct nvmf_softc *sc, nvmf_scan_active_ns_cb *cb,
    void *cb_arg)
{
	struct nvme_namespace_data *data;
	struct nvme_ns_list *nslist;
	uint32_t nsid;
	bool retval;

	nslist = malloc(sizeof(*nslist), M_NVMF, M_WAITOK);
	data = malloc(sizeof(*data), M_NVMF, M_WAITOK);

	nsid = 0;
	retval = true;
	for (;;) {
		if (!nvmf_scan_active_nslist(sc, nslist, data, &nsid, cb,
		    cb_arg)) {
			retval = false;
			break;
		}
		if (nsid == 0)
			break;
	}

	free(data, M_NVMF);
	free(nslist, M_NVMF);
	return (retval);
}

static bool
nvmf_add_ns(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data, void *arg __unused)
{
	if (sc->ns[nsid - 1] != NULL) {
		device_printf(sc->dev,
		    "duplicate namespace %u in active namespace list\n",
		    nsid);
		return (false);
	}

	/*
	 * As in nvme_ns_construct, a size of zero indicates an
	 * invalid namespace.
	 */
	if (data->nsze == 0) {
		device_printf(sc->dev,
		    "ignoring active namespace %u with zero size\n", nsid);
		return (true);
	}

	sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);

	nvmf_sim_rescan_ns(sc, nsid);
	return (true);
}

static bool
nvmf_add_namespaces(struct nvmf_softc *sc)
{
	sc->ns = mallocarray(sc->cdata->nn, sizeof(*sc->ns), M_NVMF,
	    M_WAITOK | M_ZERO);
	return (nvmf_scan_active_namespaces(sc, nvmf_add_ns, NULL));
}

static int
nvmf_attach(device_t dev)
{
	struct make_dev_args mda;
	struct nvmf_softc *sc = device_get_softc(dev);
	nvlist_t *nvl = device_get_ivars(dev);
	const nvlist_t * const *io;
	struct sysctl_oid *oid;
	uint64_t val;
	u_int i;
	int error;

	if (nvl == NULL)
		return (ENXIO);

	sc->dev = dev;
	sc->trtype = nvlist_get_number(nvl, "trtype");
	callout_init(&sc->ka_rx_timer, 1);
	callout_init(&sc->ka_tx_timer, 1);
	sx_init(&sc->connection_lock, "nvmf connection");
	TASK_INIT(&sc->disconnect_task, 0, nvmf_disconnect_task, sc);

	oid = SYSCTL_ADD_NODE(device_get_sysctl_ctx(dev),
	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "ioq",
	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "I/O Queues");
	sc->ioq_oid_list = SYSCTL_CHILDREN(oid);

	sc->cdata = malloc(sizeof(*sc->cdata), M_NVMF, M_WAITOK);

	nvmf_init_aer(sc);

	error = nvmf_establish_connection(sc, nvl);
	if (error != 0)
		goto out;

	error = nvmf_read_property(sc, NVMF_PROP_CAP, 8, &sc->cap);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch CAP\n");
		error = ENXIO;
		goto out;
	}

	error = nvmf_read_property(sc, NVMF_PROP_VS, 4, &val);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch VS\n");
		error = ENXIO;
		goto out;
	}
	sc->vs = val;

	/* Honor MDTS if it is set. */
	sc->max_xfer_size = maxphys;
	if (sc->cdata->mdts != 0) {
		sc->max_xfer_size = ulmin(sc->max_xfer_size,
		    1 << (sc->cdata->mdts + NVME_MPS_SHIFT +
		    NVME_CAP_HI_MPSMIN(sc->cap >> 32)));
	}

	io = nvlist_get_nvlist_array(nvl, "io", NULL);
	sc->max_pending_io = nvlist_get_number(io[0], "qsize") *
	    sc->num_io_queues;

	error = nvmf_init_sim(sc);
	if (error != 0)
		goto out;

	error = nvmf_start_aer(sc);
	if (error != 0) {
		nvmf_destroy_sim(sc);
		goto out;
	}

	if (!nvmf_add_namespaces(sc)) {
		nvmf_destroy_sim(sc);
		error = ENXIO;
		goto out;
	}

	make_dev_args_init(&mda);
	mda.mda_devsw = &nvmf_cdevsw;
	mda.mda_uid = UID_ROOT;
	mda.mda_gid = GID_WHEEL;
	mda.mda_mode = 0600;
	mda.mda_si_drv1 = sc;
	error = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev));
	if (error != 0) {
		nvmf_destroy_sim(sc);
		goto out;
	}

	sc->shutdown_pre_sync_eh = EVENTHANDLER_REGISTER(shutdown_pre_sync,
	    nvmf_shutdown_pre_sync, sc, SHUTDOWN_PRI_FIRST);
	sc->shutdown_post_sync_eh = EVENTHANDLER_REGISTER(shutdown_post_sync,
	    nvmf_shutdown_post_sync, sc, SHUTDOWN_PRI_LAST);

	return (0);
out:
	if (sc->ns != NULL) {
		for (i = 0; i < sc->cdata->nn; i++) {
			if (sc->ns[i] != NULL)
				nvmf_destroy_ns(sc->ns[i]);
		}
		free(sc->ns, M_NVMF);
	}

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);

	if (sc->admin != NULL)
		nvmf_shutdown_controller(sc);

	for (i = 0; i < sc->num_io_queues; i++) {
		if (sc->io[i] != NULL)
			nvmf_destroy_qp(sc->io[i]);
	}
	free(sc->io, M_NVMF);
	if (sc->admin != NULL)
		nvmf_destroy_qp(sc->admin);

	nvmf_destroy_aer(sc);

	taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
	sx_destroy(&sc->connection_lock);
	nvlist_destroy(sc->rparams);
	free(sc->cdata, M_NVMF);
	return (error);
}

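/*
 * Tear down the current association after a transport error or KeepAlive
 * timeout.  The work is deferred to a taskqueue task since it must sleep;
 * the task quiesces the SIM and namespaces and destroys the queue pairs,
 * leaving the softc ready for a later reconnect request.
 */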
void
nvmf_disconnect(struct nvmf_softc *sc)
{
	taskqueue_enqueue(taskqueue_thread, &sc->disconnect_task);
}

static void
nvmf_disconnect_task(void *arg, int pending __unused)
{
	struct nvmf_softc *sc = arg;
	u_int i;

	sx_xlock(&sc->connection_lock);
	if (sc->admin == NULL) {
		/*
		 * Ignore transport errors if there is no active
		 * association.
		 */
		sx_xunlock(&sc->connection_lock);
		return;
	}

	if (sc->detaching) {
		if (sc->admin != NULL) {
			/*
			 * This unsticks the detach process if a
			 * transport error occurs during detach.
			 */
			nvmf_shutdown_qp(sc->admin);
		}
		sx_xunlock(&sc->connection_lock);
		return;
	}

	if (sc->cdev == NULL) {
		/*
		 * A transport error occurred during attach
		 * (nvmf_add_namespaces).  Shut down the admin queue.
		 */
		nvmf_shutdown_qp(sc->admin);
		sx_xunlock(&sc->connection_lock);
		return;
	}

	nanotime(&sc->last_disconnect);
	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);
	sc->ka_traffic = false;

	/* Quiesce namespace consumers. */
	nvmf_disconnect_sim(sc);
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_disconnect_ns(sc->ns[i]);
	}

	/* Shut down the existing qpairs. */
	for (i = 0; i < sc->num_io_queues; i++) {
		nvmf_destroy_qp(sc->io[i]);
	}
	free(sc->io, M_NVMF);
	sc->io = NULL;
	sc->num_io_queues = 0;
	nvmf_destroy_qp(sc->admin);
	sc->admin = NULL;

	sx_xunlock(&sc->connection_lock);
}

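/*
 * Handle a reconnect request from userland: validate the new handoff
 * data, verify that it targets the same subsystem (by subsystem NQN)
 * over the same transport type, and then establish the new association
 * and restart the namespace consumers.
 */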
static int
nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
{
	const struct nvme_controller_data *cdata;
	nvlist_t *nvl;
	u_int i;
	int error;

	error = nvmf_copyin_handoff(nv, &nvl);
	if (error != 0)
		return (error);

	/* XXX: Should we permit changing the transport type? */
	if (sc->trtype != nvlist_get_number(nvl, "trtype")) {
		device_printf(sc->dev,
		    "transport type mismatch on reconnect\n");
		nvlist_destroy(nvl);
		return (EINVAL);
	}

	sx_xlock(&sc->connection_lock);
	if (sc->admin != NULL || sc->detaching) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Ensure this is for the same controller.  Note that the
	 * controller ID can vary across associations if the remote
	 * system is using the dynamic controller model.  This merely
	 * ensures the new association is connected to the same NVMe
	 * subsystem.
	 */
	cdata = nvlist_get_binary(nvl, "cdata", NULL);
	if (memcmp(sc->cdata->subnqn, cdata->subnqn,
	    sizeof(cdata->subnqn)) != 0) {
		device_printf(sc->dev,
		    "controller subsystem NQN mismatch on reconnect\n");
		error = EINVAL;
		goto out;
	}

	/*
	 * XXX: Require same number and size of I/O queues so that
	 * max_pending_io is still correct?
	 */

	error = nvmf_establish_connection(sc, nvl);
	if (error != 0)
		goto out;

	error = nvmf_start_aer(sc);
	if (error != 0)
		goto out;

	device_printf(sc->dev,
	    "established new association with %u I/O queues\n",
	    sc->num_io_queues);

	/* Restart namespace consumers. */
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_reconnect_ns(sc->ns[i]);
	}
	nvmf_reconnect_sim(sc);

	nvmf_rescan_all_ns(sc);
out:
	sx_xunlock(&sc->connection_lock);
	nvlist_destroy(nvl);
	return (error);
}

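/*
 * shutdown_pre_sync handler: if the association is already disconnected,
 * fail any pending requests now so that filesystems can be unmounted
 * during the final sync instead of hanging on a dead connection.
 */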
static void
nvmf_shutdown_pre_sync(void *arg, int howto)
{
	struct nvmf_softc *sc = arg;

	if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED())
		return;

	/*
	 * If this association is disconnected, abort any pending
	 * requests with an error to permit filesystems to unmount
	 * without hanging.
	 */
	sx_xlock(&sc->connection_lock);
	if (sc->admin != NULL || sc->detaching) {
		sx_xunlock(&sc->connection_lock);
		return;
	}

	for (u_int i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_shutdown_ns(sc->ns[i]);
	}
	nvmf_shutdown_sim(sc);
	sx_xunlock(&sc->connection_lock);
}

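/*
 * shutdown_post_sync handler: if the association is still connected after
 * the final sync, stop the KeepAlive timers, request a normal controller
 * shutdown, and tear down the queue pairs.
 */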
static void
nvmf_shutdown_post_sync(void *arg, int howto)
{
	struct nvmf_softc *sc = arg;

	if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED())
		return;

	/*
	 * If this association is connected, disconnect gracefully.
	 */
	sx_xlock(&sc->connection_lock);
	if (sc->admin == NULL || sc->detaching) {
		sx_xunlock(&sc->connection_lock);
		return;
	}

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);

	nvmf_shutdown_controller(sc);

	/*
	 * Quiesce consumers so that any commands submitted after this
	 * fail with an error.  Notably, nda(4) calls nda_flush() from
	 * a post_sync handler that might be ordered after this one.
	 */
	for (u_int i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_shutdown_ns(sc->ns[i]);
	}
	nvmf_shutdown_sim(sc);

	for (u_int i = 0; i < sc->num_io_queues; i++) {
		nvmf_destroy_qp(sc->io[i]);
	}
	nvmf_destroy_qp(sc->admin);
	sc->admin = NULL;
	sx_xunlock(&sc->connection_lock);
}

static int
nvmf_detach(device_t dev)
{
	struct nvmf_softc *sc = device_get_softc(dev);
	u_int i;

	destroy_dev(sc->cdev);

	sx_xlock(&sc->connection_lock);
	sc->detaching = true;
	sx_xunlock(&sc->connection_lock);

	EVENTHANDLER_DEREGISTER(shutdown_pre_sync, sc->shutdown_pre_sync_eh);
	EVENTHANDLER_DEREGISTER(shutdown_post_sync, sc->shutdown_post_sync_eh);

	nvmf_destroy_sim(sc);
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_destroy_ns(sc->ns[i]);
	}
	free(sc->ns, M_NVMF);

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);

	if (sc->admin != NULL)
		nvmf_shutdown_controller(sc);

	for (i = 0; i < sc->num_io_queues; i++) {
		nvmf_destroy_qp(sc->io[i]);
	}
	free(sc->io, M_NVMF);

	taskqueue_drain(taskqueue_thread, &sc->disconnect_task);

	if (sc->admin != NULL)
		nvmf_destroy_qp(sc->admin);

	nvmf_destroy_aer(sc);

	sx_destroy(&sc->connection_lock);
	nvlist_destroy(sc->rparams);
	free(sc->cdata, M_NVMF);
	return (0);
}

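/*
 * Apply fresh IDENTIFY namespace data to the namespace table: create the
 * namespace if it is new, update it in place if it already exists, and
 * destroy it if the controller now reports a zero size; finish with a
 * SIM rescan of the namespace.
 */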
static void
nvmf_rescan_ns_1(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data)
{
	struct nvmf_namespace *ns;

	/* XXX: Needs locking around sc->ns[]. */
	ns = sc->ns[nsid - 1];
	if (data->nsze == 0) {
		/* XXX: Needs locking */
		if (ns != NULL) {
			nvmf_destroy_ns(ns);
			sc->ns[nsid - 1] = NULL;
		}
	} else {
		/* XXX: Needs locking */
		if (ns == NULL) {
			sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
		} else {
			if (!nvmf_update_ns(ns, data)) {
				nvmf_destroy_ns(ns);
				sc->ns[nsid - 1] = NULL;
			}
		}
	}

	nvmf_sim_rescan_ns(sc, nsid);
}

void
nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid)
{
	struct nvmf_completion_status status;
	struct nvme_namespace_data *data;

	data = malloc(sizeof(*data), M_NVMF, M_WAITOK);

	nvmf_status_init(&status);
	nvmf_status_wait_io(&status);
	if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
	    &status, nvmf_io_complete, &status, M_WAITOK)) {
		device_printf(sc->dev,
		    "failed to send IDENTIFY namespace %u command\n", nsid);
		free(data, M_NVMF);
		return;
	}
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev,
		    "IDENTIFY namespace %u failed, status %#x\n", nsid,
		    le16toh(status.cqe.status));
		free(data, M_NVMF);
		return;
	}

	if (status.io_error != 0) {
		device_printf(sc->dev,
		    "IDENTIFY namespace %u failed with I/O error %d\n",
		    nsid, status.io_error);
		free(data, M_NVMF);
		return;
	}

	nvme_namespace_data_swapbytes(data);

	nvmf_rescan_ns_1(sc, nsid, data);

	free(data, M_NVMF);
}

static void
nvmf_purge_namespaces(struct nvmf_softc *sc, uint32_t first_nsid,
    uint32_t next_valid_nsid)
{
	struct nvmf_namespace *ns;

	for (uint32_t nsid = first_nsid; nsid < next_valid_nsid; nsid++) {
		/* XXX: Needs locking around sc->ns[]. */
		ns = sc->ns[nsid - 1];
		if (ns != NULL) {
			nvmf_destroy_ns(ns);
			sc->ns[nsid - 1] = NULL;

			nvmf_sim_rescan_ns(sc, nsid);
		}
	}
}

static bool
nvmf_rescan_ns_cb(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data, void *arg)
{
	uint32_t *last_nsid = arg;

	/* Check for any gaps prior to this namespace. */
	nvmf_purge_namespaces(sc, *last_nsid + 1, nsid);
	*last_nsid = nsid;

	nvmf_rescan_ns_1(sc, nsid, data);
	return (true);
}

void
nvmf_rescan_all_ns(struct nvmf_softc *sc)
{
	uint32_t last_nsid;

	last_nsid = 0;
	if (!nvmf_scan_active_namespaces(sc, nvmf_rescan_ns_cb, &last_nsid))
		return;

	/*
	 * Check for any namespace devices after the last active
	 * namespace.
	 */
	nvmf_purge_namespaces(sc, last_nsid + 1, sc->cdata->nn + 1);
}

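/*
 * Execute a passthrough command from userland: bounce the user buffer
 * through a kernel allocation, submit the command on the admin queue or
 * an I/O queue chosen by nvmf_select_io_queue(), and copy the completion
 * (and any read data) back out.
 */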
int
nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt,
    bool admin)
{
	struct nvmf_completion_status status;
	struct nvme_command cmd;
	struct memdesc mem;
	struct nvmf_host_qpair *qp;
	struct nvmf_request *req;
	void *buf;
	int error;

	if (pt->len > sc->max_xfer_size)
		return (EINVAL);

	buf = NULL;
	if (pt->len != 0) {
		/*
		 * XXX: Depending on the size we may want to pin the
		 * user pages and use a memdesc with vm_page_t's
		 * instead.
		 */
		buf = malloc(pt->len, M_NVMF, M_WAITOK);
		if (pt->is_read == 0) {
			error = copyin(pt->buf, buf, pt->len);
			if (error != 0) {
				free(buf, M_NVMF);
				return (error);
			}
		} else {
			/* Ensure no kernel data is leaked to userland. */
			memset(buf, 0, pt->len);
		}
	}

	memset(&cmd, 0, sizeof(cmd));
	cmd.opc = pt->cmd.opc;
	cmd.fuse = pt->cmd.fuse;
	cmd.nsid = pt->cmd.nsid;
	cmd.cdw10 = pt->cmd.cdw10;
	cmd.cdw11 = pt->cmd.cdw11;
	cmd.cdw12 = pt->cmd.cdw12;
	cmd.cdw13 = pt->cmd.cdw13;
	cmd.cdw14 = pt->cmd.cdw14;
	cmd.cdw15 = pt->cmd.cdw15;

	sx_slock(&sc->connection_lock);
	if (sc->admin == NULL || sc->detaching) {
		device_printf(sc->dev,
		    "failed to send passthrough command\n");
		error = ECONNABORTED;
		sx_sunlock(&sc->connection_lock);
		goto error;
	}
	if (admin)
		qp = sc->admin;
	else
		qp = nvmf_select_io_queue(sc);
	nvmf_status_init(&status);
	req = nvmf_allocate_request(qp, &cmd, nvmf_complete, &status, M_WAITOK);
	sx_sunlock(&sc->connection_lock);
	if (req == NULL) {
		device_printf(sc->dev, "failed to send passthrough command\n");
		error = ECONNABORTED;
		goto error;
	}

	if (pt->len != 0) {
		mem = memdesc_vaddr(buf, pt->len);
		nvmf_capsule_append_data(req->nc, &mem, pt->len,
		    pt->is_read == 0, nvmf_io_complete, &status);
		nvmf_status_wait_io(&status);
	}

	nvmf_submit_request(req);
	nvmf_wait_for_reply(&status);

	memset(&pt->cpl, 0, sizeof(pt->cpl));
	pt->cpl.cdw0 = status.cqe.cdw0;
	pt->cpl.status = status.cqe.status;

	error = status.io_error;
	if (error == 0 && pt->len != 0 && pt->is_read != 0)
		error = copyout(buf, pt->buf, pt->len);
error:
	free(buf, M_NVMF);
	return (error);
}

static int
nvmf_reconnect_params(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
{
	int error;

	sx_slock(&sc->connection_lock);
	error = nvmf_pack_ioc_nvlist(sc->rparams, nv);
	sx_sunlock(&sc->connection_lock);

	return (error);
}

static int
nvmf_connection_status(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
{
	nvlist_t *nvl, *nvl_ts;
	int error;

	nvl = nvlist_create(0);
	nvl_ts = nvlist_create(0);

	sx_slock(&sc->connection_lock);
	nvlist_add_bool(nvl, "connected", sc->admin != NULL);
	nvlist_add_number(nvl_ts, "tv_sec", sc->last_disconnect.tv_sec);
	nvlist_add_number(nvl_ts, "tv_nsec", sc->last_disconnect.tv_nsec);
	sx_sunlock(&sc->connection_lock);
	nvlist_move_nvlist(nvl, "last_disconnect", nvl_ts);

	error = nvmf_pack_ioc_nvlist(nvl, nv);
	nvlist_destroy(nvl);
	return (error);
}

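/*
 * Character device ioctl handler for the per-controller device node.
 * The standard nvme(4) controller ioctls are handled here alongside the
 * nvmf-specific reconnect and connection-status requests.
 */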
static int
nvmf_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
    struct thread *td)
{
	struct nvmf_softc *sc = cdev->si_drv1;
	struct nvme_get_nsid *gnsid;
	struct nvme_pt_command *pt;
	struct nvmf_ioc_nv *nv;

	switch (cmd) {
	case NVME_PASSTHROUGH_CMD:
		pt = (struct nvme_pt_command *)arg;
		return (nvmf_passthrough_cmd(sc, pt, true));
	case NVME_GET_NSID:
		gnsid = (struct nvme_get_nsid *)arg;
		strlcpy(gnsid->cdev, device_get_nameunit(sc->dev),
		    sizeof(gnsid->cdev));
		gnsid->nsid = 0;
		return (0);
	case NVME_GET_MAX_XFER_SIZE:
		*(uint64_t *)arg = sc->max_xfer_size;
		return (0);
	case NVME_GET_CONTROLLER_DATA:
		memcpy(arg, sc->cdata, sizeof(*sc->cdata));
		return (0);
	case NVMF_RECONNECT_PARAMS:
		nv = (struct nvmf_ioc_nv *)arg;
		return (nvmf_reconnect_params(sc, nv));
	case NVMF_RECONNECT_HOST:
		nv = (struct nvmf_ioc_nv *)arg;
		return (nvmf_reconnect_host(sc, nv));
	case NVMF_CONNECTION_STATUS:
		nv = (struct nvmf_ioc_nv *)arg;
		return (nvmf_connection_status(sc, nv));
	default:
		return (ENOTTY);
	}
}

static struct cdevsw nvmf_cdevsw = {
	.d_version = D_VERSION,
	.d_ioctl = nvmf_ioctl
};

static int
nvmf_modevent(module_t mod, int what, void *arg)
{
	switch (what) {
	case MOD_LOAD:
		return (nvmf_ctl_load());
	case MOD_QUIESCE:
		return (0);
	case MOD_UNLOAD:
		nvmf_ctl_unload();
		destroy_dev_drain(&nvmf_cdevsw);
		return (0);
	default:
		return (EOPNOTSUPP);
	}
}

static device_method_t nvmf_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe, nvmf_probe),
	DEVMETHOD(device_attach, nvmf_attach),
	DEVMETHOD(device_detach, nvmf_detach),
	DEVMETHOD_END
};

driver_t nvme_nvmf_driver = {
	"nvme",
	nvmf_methods,
	sizeof(struct nvmf_softc),
};

DRIVER_MODULE(nvme, root, nvme_nvmf_driver, nvmf_modevent, NULL);
MODULE_DEPEND(nvmf, nvmf_transport, 1, 1, 1);