1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2023-2024 Chelsio Communications, Inc.
5 * Written by: John Baldwin <jhb@FreeBSD.org>
6 */
7
8 #include <sys/param.h>
9 #include <sys/bus.h>
10 #include <sys/conf.h>
11 #include <sys/dnv.h>
12 #include <sys/eventhandler.h>
13 #include <sys/lock.h>
14 #include <sys/kernel.h>
15 #include <sys/malloc.h>
16 #include <sys/memdesc.h>
17 #include <sys/module.h>
18 #include <sys/mutex.h>
19 #include <sys/nv.h>
20 #include <sys/reboot.h>
21 #include <sys/sx.h>
22 #include <sys/sysctl.h>
23 #include <sys/taskqueue.h>
24 #include <dev/nvme/nvme.h>
25 #include <dev/nvmf/nvmf.h>
26 #include <dev/nvmf/nvmf_transport.h>
27 #include <dev/nvmf/host/nvmf_var.h>
28
29 static struct cdevsw nvmf_cdevsw;
30
31 bool nvmf_fail_disconnect = false;
32 SYSCTL_BOOL(_kern_nvmf, OID_AUTO, fail_on_disconnection, CTLFLAG_RWTUN,
33 &nvmf_fail_disconnect, 0, "Fail I/O requests on connection failure");
34
35 MALLOC_DEFINE(M_NVMF, "nvmf", "NVMe over Fabrics host");
36
37 static void nvmf_disconnect_task(void *arg, int pending);
38 static void nvmf_shutdown_pre_sync(void *arg, int howto);
39 static void nvmf_shutdown_post_sync(void *arg, int howto);
40
41 void
nvmf_complete(void * arg,const struct nvme_completion * cqe)42 nvmf_complete(void *arg, const struct nvme_completion *cqe)
43 {
44 struct nvmf_completion_status *status = arg;
45 struct mtx *mtx;
46
47 status->cqe = *cqe;
48 mtx = mtx_pool_find(mtxpool_sleep, status);
49 mtx_lock(mtx);
50 status->done = true;
51 mtx_unlock(mtx);
52 wakeup(status);
53 }
54
55 void
nvmf_io_complete(void * arg,size_t xfered,int error)56 nvmf_io_complete(void *arg, size_t xfered, int error)
57 {
58 struct nvmf_completion_status *status = arg;
59 struct mtx *mtx;
60
61 status->io_error = error;
62 mtx = mtx_pool_find(mtxpool_sleep, status);
63 mtx_lock(mtx);
64 status->io_done = true;
65 mtx_unlock(mtx);
66 wakeup(status);
67 }
68
69 void
nvmf_wait_for_reply(struct nvmf_completion_status * status)70 nvmf_wait_for_reply(struct nvmf_completion_status *status)
71 {
72 struct mtx *mtx;
73
74 mtx = mtx_pool_find(mtxpool_sleep, status);
75 mtx_lock(mtx);
76 while (!status->done || !status->io_done)
77 mtx_sleep(status, mtx, 0, "nvmfcmd", 0);
78 mtx_unlock(mtx);
79 }
80
81 static int
nvmf_read_property(struct nvmf_softc * sc,uint32_t offset,uint8_t size,uint64_t * value)82 nvmf_read_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
83 uint64_t *value)
84 {
85 const struct nvmf_fabric_prop_get_rsp *rsp;
86 struct nvmf_completion_status status;
87
88 nvmf_status_init(&status);
89 if (!nvmf_cmd_get_property(sc, offset, size, nvmf_complete, &status,
90 M_WAITOK))
91 return (ECONNABORTED);
92 nvmf_wait_for_reply(&status);
93
94 if (status.cqe.status != 0) {
95 device_printf(sc->dev, "PROPERTY_GET failed, status %#x\n",
96 le16toh(status.cqe.status));
97 return (EIO);
98 }
99
100 rsp = (const struct nvmf_fabric_prop_get_rsp *)&status.cqe;
101 if (size == 8)
102 *value = le64toh(rsp->value.u64);
103 else
104 *value = le32toh(rsp->value.u32.low);
105 return (0);
106 }
107
108 static int
nvmf_write_property(struct nvmf_softc * sc,uint32_t offset,uint8_t size,uint64_t value)109 nvmf_write_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
110 uint64_t value)
111 {
112 struct nvmf_completion_status status;
113
114 nvmf_status_init(&status);
115 if (!nvmf_cmd_set_property(sc, offset, size, value, nvmf_complete, &status,
116 M_WAITOK))
117 return (ECONNABORTED);
118 nvmf_wait_for_reply(&status);
119
120 if (status.cqe.status != 0) {
121 device_printf(sc->dev, "PROPERTY_SET failed, status %#x\n",
122 le16toh(status.cqe.status));
123 return (EIO);
124 }
125 return (0);
126 }
127
128 static void
nvmf_shutdown_controller(struct nvmf_softc * sc)129 nvmf_shutdown_controller(struct nvmf_softc *sc)
130 {
131 uint64_t cc;
132 int error;
133
134 error = nvmf_read_property(sc, NVMF_PROP_CC, 4, &cc);
135 if (error != 0) {
136 device_printf(sc->dev, "Failed to fetch CC for shutdown\n");
137 return;
138 }
139
140 cc |= NVMEF(NVME_CC_REG_SHN, NVME_SHN_NORMAL);
141
142 error = nvmf_write_property(sc, NVMF_PROP_CC, 4, cc);
143 if (error != 0)
144 device_printf(sc->dev,
145 "Failed to set CC to trigger shutdown\n");
146 }
147
148 static void
nvmf_check_keep_alive(void * arg)149 nvmf_check_keep_alive(void *arg)
150 {
151 struct nvmf_softc *sc = arg;
152 int traffic;
153
154 traffic = atomic_readandclear_int(&sc->ka_active_rx_traffic);
155 if (traffic == 0) {
156 device_printf(sc->dev,
157 "disconnecting due to KeepAlive timeout\n");
158 nvmf_disconnect(sc);
159 return;
160 }
161
162 callout_schedule_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0, C_HARDCLOCK);
163 }
164
165 static void
nvmf_keep_alive_complete(void * arg,const struct nvme_completion * cqe)166 nvmf_keep_alive_complete(void *arg, const struct nvme_completion *cqe)
167 {
168 struct nvmf_softc *sc = arg;
169
170 atomic_store_int(&sc->ka_active_rx_traffic, 1);
171 if (cqe->status != 0) {
172 device_printf(sc->dev,
173 "KeepAlive response reported status %#x\n",
174 le16toh(cqe->status));
175 }
176 }
177
178 static void
nvmf_send_keep_alive(void * arg)179 nvmf_send_keep_alive(void *arg)
180 {
181 struct nvmf_softc *sc = arg;
182 int traffic;
183
184 /*
185 * Don't bother sending a KeepAlive command if TKAS is active
186 * and another command has been sent during the interval.
187 */
188 traffic = atomic_load_int(&sc->ka_active_tx_traffic);
189 if (traffic == 0 && !nvmf_cmd_keep_alive(sc, nvmf_keep_alive_complete,
190 sc, M_NOWAIT))
191 device_printf(sc->dev,
192 "Failed to allocate KeepAlive command\n");
193
194 /* Clear ka_active_tx_traffic after sending the keep alive command. */
195 atomic_store_int(&sc->ka_active_tx_traffic, 0);
196
197 callout_schedule_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, C_HARDCLOCK);
198 }
199
200 int
nvmf_copyin_handoff(const struct nvmf_ioc_nv * nv,nvlist_t ** nvlp)201 nvmf_copyin_handoff(const struct nvmf_ioc_nv *nv, nvlist_t **nvlp)
202 {
203 const nvlist_t *const *io;
204 const nvlist_t *admin;
205 nvlist_t *nvl;
206 size_t i, num_io_queues;
207 uint32_t qsize;
208 int error;
209
210 error = nvmf_unpack_ioc_nvlist(nv, &nvl);
211 if (error != 0)
212 return (error);
213
214 if (!nvlist_exists_number(nvl, "trtype") ||
215 !nvlist_exists_nvlist(nvl, "admin") ||
216 !nvlist_exists_nvlist_array(nvl, "io") ||
217 !nvlist_exists_binary(nvl, "cdata"))
218 goto invalid;
219
220 admin = nvlist_get_nvlist(nvl, "admin");
221 if (!nvmf_validate_qpair_nvlist(admin, false))
222 goto invalid;
223 if (!nvlist_get_bool(admin, "admin"))
224 goto invalid;
225
226 io = nvlist_get_nvlist_array(nvl, "io", &num_io_queues);
227 if (num_io_queues < 1)
228 goto invalid;
229 for (i = 0; i < num_io_queues; i++) {
230 if (!nvmf_validate_qpair_nvlist(io[i], false))
231 goto invalid;
232 }
233
234 /* Require all I/O queues to be the same size. */
235 qsize = nvlist_get_number(io[0], "qsize");
236 for (i = 1; i < num_io_queues; i++) {
237 if (nvlist_get_number(io[i], "qsize") != qsize)
238 goto invalid;
239 }
240
241 nvlist_get_binary(nvl, "cdata", &i);
242 if (i != sizeof(struct nvme_controller_data))
243 goto invalid;
244
245 *nvlp = nvl;
246 return (0);
247 invalid:
248 nvlist_destroy(nvl);
249 return (EINVAL);
250 }
251
252 static int
nvmf_probe(device_t dev)253 nvmf_probe(device_t dev)
254 {
255 const nvlist_t *nvl = device_get_ivars(dev);
256 const struct nvme_controller_data *cdata;
257
258 if (nvl == NULL)
259 return (ENXIO);
260
261 cdata = nvlist_get_binary(nvl, "cdata", NULL);
262 device_set_descf(dev, "Fabrics: %.256s", cdata->subnqn);
263 return (BUS_PROBE_DEFAULT);
264 }
265
266 static int
nvmf_establish_connection(struct nvmf_softc * sc,const nvlist_t * nvl)267 nvmf_establish_connection(struct nvmf_softc *sc, const nvlist_t *nvl)
268 {
269 const nvlist_t *const *io;
270 const nvlist_t *admin;
271 uint64_t kato;
272 size_t num_io_queues;
273 enum nvmf_trtype trtype;
274 char name[16];
275
276 trtype = nvlist_get_number(nvl, "trtype");
277 admin = nvlist_get_nvlist(nvl, "admin");
278 io = nvlist_get_nvlist_array(nvl, "io", &num_io_queues);
279 kato = dnvlist_get_number(nvl, "kato", 0);
280
281 /* Setup the admin queue. */
282 sc->admin = nvmf_init_qp(sc, trtype, admin, "admin queue", 0);
283 if (sc->admin == NULL) {
284 device_printf(sc->dev, "Failed to setup admin queue\n");
285 return (ENXIO);
286 }
287
288 /* Setup I/O queues. */
289 sc->io = malloc(num_io_queues * sizeof(*sc->io), M_NVMF,
290 M_WAITOK | M_ZERO);
291 sc->num_io_queues = num_io_queues;
292 for (u_int i = 0; i < sc->num_io_queues; i++) {
293 snprintf(name, sizeof(name), "I/O queue %u", i);
294 sc->io[i] = nvmf_init_qp(sc, trtype, io[i], name, i);
295 if (sc->io[i] == NULL) {
296 device_printf(sc->dev, "Failed to setup I/O queue %u\n",
297 i + 1);
298 return (ENXIO);
299 }
300 }
301
302 /* Start KeepAlive timers. */
303 if (kato != 0) {
304 sc->ka_traffic = NVMEV(NVME_CTRLR_DATA_CTRATT_TBKAS,
305 sc->cdata->ctratt) != 0;
306 sc->ka_rx_sbt = mstosbt(kato);
307 sc->ka_tx_sbt = sc->ka_rx_sbt / 2;
308 callout_reset_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0,
309 nvmf_check_keep_alive, sc, C_HARDCLOCK);
310 callout_reset_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0,
311 nvmf_send_keep_alive, sc, C_HARDCLOCK);
312 }
313
314 memcpy(sc->cdata, nvlist_get_binary(nvl, "cdata", NULL),
315 sizeof(*sc->cdata));
316
317 return (0);
318 }
319
320 typedef bool nvmf_scan_active_ns_cb(struct nvmf_softc *, uint32_t,
321 const struct nvme_namespace_data *, void *);
322
323 static bool
nvmf_scan_active_nslist(struct nvmf_softc * sc,struct nvme_ns_list * nslist,struct nvme_namespace_data * data,uint32_t * nsidp,nvmf_scan_active_ns_cb * cb,void * cb_arg)324 nvmf_scan_active_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist,
325 struct nvme_namespace_data *data, uint32_t *nsidp,
326 nvmf_scan_active_ns_cb *cb, void *cb_arg)
327 {
328 struct nvmf_completion_status status;
329 uint32_t nsid;
330
331 nvmf_status_init(&status);
332 nvmf_status_wait_io(&status);
333 if (!nvmf_cmd_identify_active_namespaces(sc, *nsidp, nslist,
334 nvmf_complete, &status, nvmf_io_complete, &status, M_WAITOK)) {
335 device_printf(sc->dev,
336 "failed to send IDENTIFY active namespaces command\n");
337 return (false);
338 }
339 nvmf_wait_for_reply(&status);
340
341 if (status.cqe.status != 0) {
342 device_printf(sc->dev,
343 "IDENTIFY active namespaces failed, status %#x\n",
344 le16toh(status.cqe.status));
345 return (false);
346 }
347
348 if (status.io_error != 0) {
349 device_printf(sc->dev,
350 "IDENTIFY active namespaces failed with I/O error %d\n",
351 status.io_error);
352 return (false);
353 }
354
355 for (u_int i = 0; i < nitems(nslist->ns); i++) {
356 nsid = nslist->ns[i];
357 if (nsid == 0) {
358 *nsidp = 0;
359 return (true);
360 }
361
362 nvmf_status_init(&status);
363 nvmf_status_wait_io(&status);
364 if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
365 &status, nvmf_io_complete, &status, M_WAITOK)) {
366 device_printf(sc->dev,
367 "failed to send IDENTIFY namespace %u command\n",
368 nsid);
369 return (false);
370 }
371 nvmf_wait_for_reply(&status);
372
373 if (status.cqe.status != 0) {
374 device_printf(sc->dev,
375 "IDENTIFY namespace %u failed, status %#x\n", nsid,
376 le16toh(status.cqe.status));
377 return (false);
378 }
379
380 if (status.io_error != 0) {
381 device_printf(sc->dev,
382 "IDENTIFY namespace %u failed with I/O error %d\n",
383 nsid, status.io_error);
384 return (false);
385 }
386
387 nvme_namespace_data_swapbytes(data);
388 if (!cb(sc, nsid, data, cb_arg))
389 return (false);
390 }
391
392 MPASS(nsid == nslist->ns[nitems(nslist->ns) - 1] && nsid != 0);
393
394 if (nsid >= NVME_GLOBAL_NAMESPACE_TAG - 1)
395 *nsidp = 0;
396 else
397 *nsidp = nsid;
398 return (true);
399 }
400
401 static bool
nvmf_scan_active_namespaces(struct nvmf_softc * sc,nvmf_scan_active_ns_cb * cb,void * cb_arg)402 nvmf_scan_active_namespaces(struct nvmf_softc *sc, nvmf_scan_active_ns_cb *cb,
403 void *cb_arg)
404 {
405 struct nvme_namespace_data *data;
406 struct nvme_ns_list *nslist;
407 uint32_t nsid;
408 bool retval;
409
410 nslist = malloc(sizeof(*nslist), M_NVMF, M_WAITOK);
411 data = malloc(sizeof(*data), M_NVMF, M_WAITOK);
412
413 nsid = 0;
414 retval = true;
415 for (;;) {
416 if (!nvmf_scan_active_nslist(sc, nslist, data, &nsid, cb,
417 cb_arg)) {
418 retval = false;
419 break;
420 }
421 if (nsid == 0)
422 break;
423 }
424
425 free(data, M_NVMF);
426 free(nslist, M_NVMF);
427 return (retval);
428 }
429
430 static bool
nvmf_add_ns(struct nvmf_softc * sc,uint32_t nsid,const struct nvme_namespace_data * data,void * arg __unused)431 nvmf_add_ns(struct nvmf_softc *sc, uint32_t nsid,
432 const struct nvme_namespace_data *data, void *arg __unused)
433 {
434 if (sc->ns[nsid - 1] != NULL) {
435 device_printf(sc->dev,
436 "duplicate namespace %u in active namespace list\n",
437 nsid);
438 return (false);
439 }
440
441 /*
442 * As in nvme_ns_construct, a size of zero indicates an
443 * invalid namespace.
444 */
445 if (data->nsze == 0) {
446 device_printf(sc->dev,
447 "ignoring active namespace %u with zero size\n", nsid);
448 return (true);
449 }
450
451 sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
452
453 nvmf_sim_rescan_ns(sc, nsid);
454 return (true);
455 }
456
457 static bool
nvmf_add_namespaces(struct nvmf_softc * sc)458 nvmf_add_namespaces(struct nvmf_softc *sc)
459 {
460 sc->ns = mallocarray(sc->cdata->nn, sizeof(*sc->ns), M_NVMF,
461 M_WAITOK | M_ZERO);
462 return (nvmf_scan_active_namespaces(sc, nvmf_add_ns, NULL));
463 }
464
465 static int
nvmf_attach(device_t dev)466 nvmf_attach(device_t dev)
467 {
468 struct make_dev_args mda;
469 struct nvmf_softc *sc = device_get_softc(dev);
470 const nvlist_t *nvl = device_get_ivars(dev);
471 const nvlist_t * const *io;
472 struct sysctl_oid *oid;
473 uint64_t val;
474 u_int i;
475 int error;
476
477 if (nvl == NULL)
478 return (ENXIO);
479
480 sc->dev = dev;
481 sc->trtype = nvlist_get_number(nvl, "trtype");
482 callout_init(&sc->ka_rx_timer, 1);
483 callout_init(&sc->ka_tx_timer, 1);
484 sx_init(&sc->connection_lock, "nvmf connection");
485 TASK_INIT(&sc->disconnect_task, 0, nvmf_disconnect_task, sc);
486
487 oid = SYSCTL_ADD_NODE(device_get_sysctl_ctx(dev),
488 SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "ioq",
489 CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "I/O Queues");
490 sc->ioq_oid_list = SYSCTL_CHILDREN(oid);
491
492 sc->cdata = malloc(sizeof(*sc->cdata), M_NVMF, M_WAITOK);
493
494 nvmf_init_aer(sc);
495
496 error = nvmf_establish_connection(sc, nvl);
497 if (error != 0)
498 goto out;
499
500 error = nvmf_read_property(sc, NVMF_PROP_CAP, 8, &sc->cap);
501 if (error != 0) {
502 device_printf(sc->dev, "Failed to fetch CAP\n");
503 error = ENXIO;
504 goto out;
505 }
506
507 error = nvmf_read_property(sc, NVMF_PROP_VS, 4, &val);
508 if (error != 0) {
509 device_printf(sc->dev, "Failed to fetch VS\n");
510 error = ENXIO;
511 goto out;
512 }
513 sc->vs = val;
514
515 /* Honor MDTS if it is set. */
516 sc->max_xfer_size = maxphys;
517 if (sc->cdata->mdts != 0) {
518 sc->max_xfer_size = ulmin(sc->max_xfer_size,
519 1 << (sc->cdata->mdts + NVME_MPS_SHIFT +
520 NVME_CAP_HI_MPSMIN(sc->cap >> 32)));
521 }
522
523 io = nvlist_get_nvlist_array(nvl, "io", NULL);
524 sc->max_pending_io = nvlist_get_number(io[0], "qsize") *
525 sc->num_io_queues;
526
527 error = nvmf_init_sim(sc);
528 if (error != 0)
529 goto out;
530
531 error = nvmf_start_aer(sc);
532 if (error != 0) {
533 nvmf_destroy_sim(sc);
534 goto out;
535 }
536
537 if (!nvmf_add_namespaces(sc)) {
538 nvmf_destroy_sim(sc);
539 goto out;
540 }
541
542 make_dev_args_init(&mda);
543 mda.mda_devsw = &nvmf_cdevsw;
544 mda.mda_uid = UID_ROOT;
545 mda.mda_gid = GID_WHEEL;
546 mda.mda_mode = 0600;
547 mda.mda_si_drv1 = sc;
548 error = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev));
549 if (error != 0) {
550 nvmf_destroy_sim(sc);
551 goto out;
552 }
553
554 sc->shutdown_pre_sync_eh = EVENTHANDLER_REGISTER(shutdown_pre_sync,
555 nvmf_shutdown_pre_sync, sc, SHUTDOWN_PRI_FIRST);
556 sc->shutdown_post_sync_eh = EVENTHANDLER_REGISTER(shutdown_post_sync,
557 nvmf_shutdown_post_sync, sc, SHUTDOWN_PRI_LAST);
558
559 return (0);
560 out:
561 if (sc->ns != NULL) {
562 for (i = 0; i < sc->cdata->nn; i++) {
563 if (sc->ns[i] != NULL)
564 nvmf_destroy_ns(sc->ns[i]);
565 }
566 free(sc->ns, M_NVMF);
567 }
568
569 callout_drain(&sc->ka_tx_timer);
570 callout_drain(&sc->ka_rx_timer);
571
572 if (sc->admin != NULL)
573 nvmf_shutdown_controller(sc);
574
575 for (i = 0; i < sc->num_io_queues; i++) {
576 if (sc->io[i] != NULL)
577 nvmf_destroy_qp(sc->io[i]);
578 }
579 free(sc->io, M_NVMF);
580 if (sc->admin != NULL)
581 nvmf_destroy_qp(sc->admin);
582
583 nvmf_destroy_aer(sc);
584
585 taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
586 sx_destroy(&sc->connection_lock);
587 free(sc->cdata, M_NVMF);
588 return (error);
589 }
590
591 void
nvmf_disconnect(struct nvmf_softc * sc)592 nvmf_disconnect(struct nvmf_softc *sc)
593 {
594 taskqueue_enqueue(taskqueue_thread, &sc->disconnect_task);
595 }
596
597 static void
nvmf_disconnect_task(void * arg,int pending __unused)598 nvmf_disconnect_task(void *arg, int pending __unused)
599 {
600 struct nvmf_softc *sc = arg;
601 u_int i;
602
603 sx_xlock(&sc->connection_lock);
604 if (sc->admin == NULL) {
605 /*
606 * Ignore transport errors if there is no active
607 * association.
608 */
609 sx_xunlock(&sc->connection_lock);
610 return;
611 }
612
613 if (sc->detaching) {
614 if (sc->admin != NULL) {
615 /*
616 * This unsticks the detach process if a
617 * transport error occurs during detach.
618 */
619 nvmf_shutdown_qp(sc->admin);
620 }
621 sx_xunlock(&sc->connection_lock);
622 return;
623 }
624
625 if (sc->cdev == NULL) {
626 /*
627 * Transport error occurred during attach (nvmf_add_namespaces).
628 * Shutdown the admin queue.
629 */
630 nvmf_shutdown_qp(sc->admin);
631 sx_xunlock(&sc->connection_lock);
632 return;
633 }
634
635 callout_drain(&sc->ka_tx_timer);
636 callout_drain(&sc->ka_rx_timer);
637 sc->ka_traffic = false;
638
639 /* Quiesce namespace consumers. */
640 nvmf_disconnect_sim(sc);
641 for (i = 0; i < sc->cdata->nn; i++) {
642 if (sc->ns[i] != NULL)
643 nvmf_disconnect_ns(sc->ns[i]);
644 }
645
646 /* Shutdown the existing qpairs. */
647 for (i = 0; i < sc->num_io_queues; i++) {
648 nvmf_destroy_qp(sc->io[i]);
649 }
650 free(sc->io, M_NVMF);
651 sc->io = NULL;
652 sc->num_io_queues = 0;
653 nvmf_destroy_qp(sc->admin);
654 sc->admin = NULL;
655
656 sx_xunlock(&sc->connection_lock);
657 }
658
659 static int
nvmf_reconnect_host(struct nvmf_softc * sc,struct nvmf_ioc_nv * nv)660 nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
661 {
662 const struct nvme_controller_data *cdata;
663 nvlist_t *nvl;
664 u_int i;
665 int error;
666
667 error = nvmf_copyin_handoff(nv, &nvl);
668 if (error != 0)
669 return (error);
670
671 /* XXX: Should we permit changing the transport type? */
672 if (sc->trtype != nvlist_get_number(nvl, "trtype")) {
673 device_printf(sc->dev,
674 "transport type mismatch on reconnect\n");
675 return (EINVAL);
676 }
677
678 sx_xlock(&sc->connection_lock);
679 if (sc->admin != NULL || sc->detaching) {
680 error = EBUSY;
681 goto out;
682 }
683
684 /*
685 * Ensure this is for the same controller. Note that the
686 * controller ID can vary across associations if the remote
687 * system is using the dynamic controller model. This merely
688 * ensures the new association is connected to the same NVMe
689 * subsystem.
690 */
691 cdata = nvlist_get_binary(nvl, "cdata", NULL);
692 if (memcmp(sc->cdata->subnqn, cdata->subnqn,
693 sizeof(cdata->subnqn)) != 0) {
694 device_printf(sc->dev,
695 "controller subsystem NQN mismatch on reconnect\n");
696 error = EINVAL;
697 goto out;
698 }
699
700 /*
701 * XXX: Require same number and size of I/O queues so that
702 * max_pending_io is still correct?
703 */
704
705 error = nvmf_establish_connection(sc, nvl);
706 if (error != 0)
707 goto out;
708
709 error = nvmf_start_aer(sc);
710 if (error != 0)
711 goto out;
712
713 device_printf(sc->dev,
714 "established new association with %u I/O queues\n",
715 sc->num_io_queues);
716
717 /* Restart namespace consumers. */
718 for (i = 0; i < sc->cdata->nn; i++) {
719 if (sc->ns[i] != NULL)
720 nvmf_reconnect_ns(sc->ns[i]);
721 }
722 nvmf_reconnect_sim(sc);
723
724 nvmf_rescan_all_ns(sc);
725 out:
726 sx_xunlock(&sc->connection_lock);
727 nvlist_destroy(nvl);
728 return (error);
729 }
730
731 static void
nvmf_shutdown_pre_sync(void * arg,int howto)732 nvmf_shutdown_pre_sync(void *arg, int howto)
733 {
734 struct nvmf_softc *sc = arg;
735
736 if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED())
737 return;
738
739 /*
740 * If this association is disconnected, abort any pending
741 * requests with an error to permit filesystems to unmount
742 * without hanging.
743 */
744 sx_xlock(&sc->connection_lock);
745 if (sc->admin != NULL || sc->detaching) {
746 sx_xunlock(&sc->connection_lock);
747 return;
748 }
749
750 for (u_int i = 0; i < sc->cdata->nn; i++) {
751 if (sc->ns[i] != NULL)
752 nvmf_shutdown_ns(sc->ns[i]);
753 }
754 nvmf_shutdown_sim(sc);
755 sx_xunlock(&sc->connection_lock);
756 }
757
758 static void
nvmf_shutdown_post_sync(void * arg,int howto)759 nvmf_shutdown_post_sync(void *arg, int howto)
760 {
761 struct nvmf_softc *sc = arg;
762
763 if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED())
764 return;
765
766 /*
767 * If this association is connected, disconnect gracefully.
768 */
769 sx_xlock(&sc->connection_lock);
770 if (sc->admin == NULL || sc->detaching) {
771 sx_xunlock(&sc->connection_lock);
772 return;
773 }
774
775 callout_drain(&sc->ka_tx_timer);
776 callout_drain(&sc->ka_rx_timer);
777
778 nvmf_shutdown_controller(sc);
779
780 /*
781 * Quiesce consumers so that any commands submitted after this
782 * fail with an error. Notably, nda(4) calls nda_flush() from
783 * a post_sync handler that might be ordered after this one.
784 */
785 for (u_int i = 0; i < sc->cdata->nn; i++) {
786 if (sc->ns[i] != NULL)
787 nvmf_shutdown_ns(sc->ns[i]);
788 }
789 nvmf_shutdown_sim(sc);
790
791 for (u_int i = 0; i < sc->num_io_queues; i++) {
792 nvmf_destroy_qp(sc->io[i]);
793 }
794 nvmf_destroy_qp(sc->admin);
795 sc->admin = NULL;
796 sx_xunlock(&sc->connection_lock);
797 }
798
799 static int
nvmf_detach(device_t dev)800 nvmf_detach(device_t dev)
801 {
802 struct nvmf_softc *sc = device_get_softc(dev);
803 u_int i;
804
805 destroy_dev(sc->cdev);
806
807 sx_xlock(&sc->connection_lock);
808 sc->detaching = true;
809 sx_xunlock(&sc->connection_lock);
810
811 EVENTHANDLER_DEREGISTER(shutdown_pre_sync, sc->shutdown_pre_sync_eh);
812 EVENTHANDLER_DEREGISTER(shutdown_post_sync, sc->shutdown_post_sync_eh);
813
814 nvmf_destroy_sim(sc);
815 for (i = 0; i < sc->cdata->nn; i++) {
816 if (sc->ns[i] != NULL)
817 nvmf_destroy_ns(sc->ns[i]);
818 }
819 free(sc->ns, M_NVMF);
820
821 callout_drain(&sc->ka_tx_timer);
822 callout_drain(&sc->ka_rx_timer);
823
824 if (sc->admin != NULL)
825 nvmf_shutdown_controller(sc);
826
827 for (i = 0; i < sc->num_io_queues; i++) {
828 nvmf_destroy_qp(sc->io[i]);
829 }
830 free(sc->io, M_NVMF);
831
832 taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
833
834 if (sc->admin != NULL)
835 nvmf_destroy_qp(sc->admin);
836
837 nvmf_destroy_aer(sc);
838
839 sx_destroy(&sc->connection_lock);
840 free(sc->cdata, M_NVMF);
841 return (0);
842 }
843
844 static void
nvmf_rescan_ns_1(struct nvmf_softc * sc,uint32_t nsid,const struct nvme_namespace_data * data)845 nvmf_rescan_ns_1(struct nvmf_softc *sc, uint32_t nsid,
846 const struct nvme_namespace_data *data)
847 {
848 struct nvmf_namespace *ns;
849
850 /* XXX: Needs locking around sc->ns[]. */
851 ns = sc->ns[nsid - 1];
852 if (data->nsze == 0) {
853 /* XXX: Needs locking */
854 if (ns != NULL) {
855 nvmf_destroy_ns(ns);
856 sc->ns[nsid - 1] = NULL;
857 }
858 } else {
859 /* XXX: Needs locking */
860 if (ns == NULL) {
861 sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
862 } else {
863 if (!nvmf_update_ns(ns, data)) {
864 nvmf_destroy_ns(ns);
865 sc->ns[nsid - 1] = NULL;
866 }
867 }
868 }
869
870 nvmf_sim_rescan_ns(sc, nsid);
871 }
872
873 void
nvmf_rescan_ns(struct nvmf_softc * sc,uint32_t nsid)874 nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid)
875 {
876 struct nvmf_completion_status status;
877 struct nvme_namespace_data *data;
878
879 data = malloc(sizeof(*data), M_NVMF, M_WAITOK);
880
881 nvmf_status_init(&status);
882 nvmf_status_wait_io(&status);
883 if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
884 &status, nvmf_io_complete, &status, M_WAITOK)) {
885 device_printf(sc->dev,
886 "failed to send IDENTIFY namespace %u command\n", nsid);
887 free(data, M_NVMF);
888 return;
889 }
890 nvmf_wait_for_reply(&status);
891
892 if (status.cqe.status != 0) {
893 device_printf(sc->dev,
894 "IDENTIFY namespace %u failed, status %#x\n", nsid,
895 le16toh(status.cqe.status));
896 free(data, M_NVMF);
897 return;
898 }
899
900 if (status.io_error != 0) {
901 device_printf(sc->dev,
902 "IDENTIFY namespace %u failed with I/O error %d\n",
903 nsid, status.io_error);
904 free(data, M_NVMF);
905 return;
906 }
907
908 nvme_namespace_data_swapbytes(data);
909
910 nvmf_rescan_ns_1(sc, nsid, data);
911
912 free(data, M_NVMF);
913 }
914
915 static void
nvmf_purge_namespaces(struct nvmf_softc * sc,uint32_t first_nsid,uint32_t next_valid_nsid)916 nvmf_purge_namespaces(struct nvmf_softc *sc, uint32_t first_nsid,
917 uint32_t next_valid_nsid)
918 {
919 struct nvmf_namespace *ns;
920
921 for (uint32_t nsid = first_nsid; nsid < next_valid_nsid; nsid++)
922 {
923 /* XXX: Needs locking around sc->ns[]. */
924 ns = sc->ns[nsid - 1];
925 if (ns != NULL) {
926 nvmf_destroy_ns(ns);
927 sc->ns[nsid - 1] = NULL;
928
929 nvmf_sim_rescan_ns(sc, nsid);
930 }
931 }
932 }
933
/*
 * Scan callback used by nvmf_rescan_all_ns().
 *
 * 'arg' points at the NSID of the previously reported active
 * namespace.  Namespaces between that one and 'nsid' are purged, the
 * tracker is advanced to 'nsid', and the namespace itself is
 * reconciled.  Always returns true so the scan continues.
 */
static bool
nvmf_rescan_ns_cb(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data, void *arg)
{
	uint32_t *prev = arg;
	uint32_t gap_start;

	/* Check for any gaps prior to this namespace. */
	gap_start = *prev + 1;
	nvmf_purge_namespaces(sc, gap_start, nsid);
	*prev = nsid;

	nvmf_rescan_ns_1(sc, nsid, data);
	return (true);
}
947
948 void
nvmf_rescan_all_ns(struct nvmf_softc * sc)949 nvmf_rescan_all_ns(struct nvmf_softc *sc)
950 {
951 uint32_t last_nsid;
952
953 last_nsid = 0;
954 if (!nvmf_scan_active_namespaces(sc, nvmf_rescan_ns_cb, &last_nsid))
955 return;
956
957 /*
958 * Check for any namespace devices after the last active
959 * namespace.
960 */
961 nvmf_purge_namespaces(sc, last_nsid + 1, sc->cdata->nn + 1);
962 }
963
964 int
nvmf_passthrough_cmd(struct nvmf_softc * sc,struct nvme_pt_command * pt,bool admin)965 nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt,
966 bool admin)
967 {
968 struct nvmf_completion_status status;
969 struct nvme_command cmd;
970 struct memdesc mem;
971 struct nvmf_host_qpair *qp;
972 struct nvmf_request *req;
973 void *buf;
974 int error;
975
976 if (pt->len > sc->max_xfer_size)
977 return (EINVAL);
978
979 buf = NULL;
980 if (pt->len != 0) {
981 /*
982 * XXX: Depending on the size we may want to pin the
983 * user pages and use a memdesc with vm_page_t's
984 * instead.
985 */
986 buf = malloc(pt->len, M_NVMF, M_WAITOK);
987 if (pt->is_read == 0) {
988 error = copyin(pt->buf, buf, pt->len);
989 if (error != 0) {
990 free(buf, M_NVMF);
991 return (error);
992 }
993 } else {
994 /* Ensure no kernel data is leaked to userland. */
995 memset(buf, 0, pt->len);
996 }
997 }
998
999 memset(&cmd, 0, sizeof(cmd));
1000 cmd.opc = pt->cmd.opc;
1001 cmd.fuse = pt->cmd.fuse;
1002 cmd.nsid = pt->cmd.nsid;
1003 cmd.cdw10 = pt->cmd.cdw10;
1004 cmd.cdw11 = pt->cmd.cdw11;
1005 cmd.cdw12 = pt->cmd.cdw12;
1006 cmd.cdw13 = pt->cmd.cdw13;
1007 cmd.cdw14 = pt->cmd.cdw14;
1008 cmd.cdw15 = pt->cmd.cdw15;
1009
1010 sx_slock(&sc->connection_lock);
1011 if (sc->admin == NULL || sc->detaching) {
1012 device_printf(sc->dev,
1013 "failed to send passthrough command\n");
1014 error = ECONNABORTED;
1015 sx_sunlock(&sc->connection_lock);
1016 goto error;
1017 }
1018 if (admin)
1019 qp = sc->admin;
1020 else
1021 qp = nvmf_select_io_queue(sc);
1022 nvmf_status_init(&status);
1023 req = nvmf_allocate_request(qp, &cmd, nvmf_complete, &status, M_WAITOK);
1024 sx_sunlock(&sc->connection_lock);
1025 if (req == NULL) {
1026 device_printf(sc->dev, "failed to send passthrough command\n");
1027 error = ECONNABORTED;
1028 goto error;
1029 }
1030
1031 if (pt->len != 0) {
1032 mem = memdesc_vaddr(buf, pt->len);
1033 nvmf_capsule_append_data(req->nc, &mem, pt->len,
1034 pt->is_read == 0, nvmf_io_complete, &status);
1035 nvmf_status_wait_io(&status);
1036 }
1037
1038 nvmf_submit_request(req);
1039 nvmf_wait_for_reply(&status);
1040
1041 memset(&pt->cpl, 0, sizeof(pt->cpl));
1042 pt->cpl.cdw0 = status.cqe.cdw0;
1043 pt->cpl.status = status.cqe.status;
1044
1045 error = status.io_error;
1046 if (error == 0 && pt->len != 0 && pt->is_read != 0)
1047 error = copyout(buf, pt->buf, pt->len);
1048 error:
1049 free(buf, M_NVMF);
1050 return (error);
1051 }
1052
1053 static int
nvmf_reconnect_params(struct nvmf_softc * sc,struct nvmf_ioc_nv * nv)1054 nvmf_reconnect_params(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
1055 {
1056 nvlist_t *nvl;
1057 int error;
1058
1059 nvl = nvlist_create(0);
1060
1061 sx_slock(&sc->connection_lock);
1062 if ((sc->cdata->fcatt & 1) == 0)
1063 nvlist_add_number(nvl, "cntlid", NVMF_CNTLID_DYNAMIC);
1064 else
1065 nvlist_add_number(nvl, "cntlid", sc->cdata->ctrlr_id);
1066 nvlist_add_stringf(nvl, "subnqn", "%.256s", sc->cdata->subnqn);
1067 sx_sunlock(&sc->connection_lock);
1068
1069 error = nvmf_pack_ioc_nvlist(nvl, nv);
1070 nvlist_destroy(nvl);
1071 return (error);
1072 }
1073
1074 static int
nvmf_ioctl(struct cdev * cdev,u_long cmd,caddr_t arg,int flag,struct thread * td)1075 nvmf_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
1076 struct thread *td)
1077 {
1078 struct nvmf_softc *sc = cdev->si_drv1;
1079 struct nvme_get_nsid *gnsid;
1080 struct nvme_pt_command *pt;
1081 struct nvmf_ioc_nv *nv;
1082
1083 switch (cmd) {
1084 case NVME_PASSTHROUGH_CMD:
1085 pt = (struct nvme_pt_command *)arg;
1086 return (nvmf_passthrough_cmd(sc, pt, true));
1087 case NVME_GET_NSID:
1088 gnsid = (struct nvme_get_nsid *)arg;
1089 strlcpy(gnsid->cdev, device_get_nameunit(sc->dev),
1090 sizeof(gnsid->cdev));
1091 gnsid->nsid = 0;
1092 return (0);
1093 case NVME_GET_MAX_XFER_SIZE:
1094 *(uint64_t *)arg = sc->max_xfer_size;
1095 return (0);
1096 case NVMF_RECONNECT_PARAMS:
1097 nv = (struct nvmf_ioc_nv *)arg;
1098 return (nvmf_reconnect_params(sc, nv));
1099 case NVMF_RECONNECT_HOST:
1100 nv = (struct nvmf_ioc_nv *)arg;
1101 return (nvmf_reconnect_host(sc, nv));
1102 default:
1103 return (ENOTTY);
1104 }
1105 }
1106
1107 static struct cdevsw nvmf_cdevsw = {
1108 .d_version = D_VERSION,
1109 .d_ioctl = nvmf_ioctl
1110 };
1111
1112 static int
nvmf_modevent(module_t mod,int what,void * arg)1113 nvmf_modevent(module_t mod, int what, void *arg)
1114 {
1115 switch (what) {
1116 case MOD_LOAD:
1117 return (nvmf_ctl_load());
1118 case MOD_QUIESCE:
1119 return (0);
1120 case MOD_UNLOAD:
1121 nvmf_ctl_unload();
1122 destroy_dev_drain(&nvmf_cdevsw);
1123 return (0);
1124 default:
1125 return (EOPNOTSUPP);
1126 }
1127 }
1128
1129 static device_method_t nvmf_methods[] = {
1130 /* Device interface */
1131 DEVMETHOD(device_probe, nvmf_probe),
1132 DEVMETHOD(device_attach, nvmf_attach),
1133 DEVMETHOD(device_detach, nvmf_detach),
1134 DEVMETHOD_END
1135 };
1136
1137 driver_t nvme_nvmf_driver = {
1138 "nvme",
1139 nvmf_methods,
1140 sizeof(struct nvmf_softc),
1141 };
1142
1143 DRIVER_MODULE(nvme, root, nvme_nvmf_driver, nvmf_modevent, NULL);
1144 MODULE_DEPEND(nvmf, nvmf_transport, 1, 1, 1);
1145