/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2023-2024 Chelsio Communications, Inc.
 * Written by: John Baldwin <jhb@FreeBSD.org>
 */

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/memdesc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/reboot.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/nvmf.h>
#include <dev/nvmf/nvmf_transport.h>
#include <dev/nvmf/host/nvmf_var.h>

static struct cdevsw nvmf_cdevsw;

bool nvmf_fail_disconnect = false;
SYSCTL_BOOL(_kern_nvmf, OID_AUTO, fail_on_disconnection, CTLFLAG_RWTUN,
    &nvmf_fail_disconnect, 0, "Fail I/O requests on connection failure");

MALLOC_DEFINE(M_NVMF, "nvmf", "NVMe over Fabrics host");

static void nvmf_disconnect_task(void *arg, int pending);
static void nvmf_shutdown_pre_sync(void *arg, int howto);
static void nvmf_shutdown_post_sync(void *arg, int howto);

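/*
 * Completion callback for synchronous commands: save the completion
 * queue entry and wake up the thread sleeping in nvmf_wait_for_reply().
 */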
void
nvmf_complete(void *arg, const struct nvme_completion *cqe)
{
    struct nvmf_completion_status *status = arg;
    struct mtx *mtx;

    status->cqe = *cqe;
    mtx = mtx_pool_find(mtxpool_sleep, status);
    mtx_lock(mtx);
    status->done = true;
    mtx_unlock(mtx);
    wakeup(status);
}

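/*
 * Data transfer completion callback: record any transfer error and
 * wake up the thread sleeping in nvmf_wait_for_reply().
 */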
void
nvmf_io_complete(void *arg, size_t xfered, int error)
{
    struct nvmf_completion_status *status = arg;
    struct mtx *mtx;

    status->io_error = error;
    mtx = mtx_pool_find(mtxpool_sleep, status);
    mtx_lock(mtx);
    status->io_done = true;
    mtx_unlock(mtx);
    wakeup(status);
}

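/*
 * Sleep until both the command completion and any associated data
 * transfer have finished.
 */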
void
nvmf_wait_for_reply(struct nvmf_completion_status *status)
{
    struct mtx *mtx;

    mtx = mtx_pool_find(mtxpool_sleep, status);
    mtx_lock(mtx);
    while (!status->done || !status->io_done)
        mtx_sleep(status, mtx, 0, "nvmfcmd", 0);
    mtx_unlock(mtx);
}

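/*
 * Read a 4 or 8 byte controller property via a Fabrics PROPERTY_GET
 * command.
 */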
static int
nvmf_read_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
    uint64_t *value)
{
    const struct nvmf_fabric_prop_get_rsp *rsp;
    struct nvmf_completion_status status;

    nvmf_status_init(&status);
    if (!nvmf_cmd_get_property(sc, offset, size, nvmf_complete, &status,
        M_WAITOK))
        return (ECONNABORTED);
    nvmf_wait_for_reply(&status);

    if (status.cqe.status != 0) {
        device_printf(sc->dev, "PROPERTY_GET failed, status %#x\n",
            le16toh(status.cqe.status));
        return (EIO);
    }

    rsp = (const struct nvmf_fabric_prop_get_rsp *)&status.cqe;
    if (size == 8)
        *value = le64toh(rsp->value.u64);
    else
        *value = le32toh(rsp->value.u32.low);
    return (0);
}

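/*
 * Write a 4 or 8 byte controller property via a Fabrics PROPERTY_SET
 * command.
 */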
static int
nvmf_write_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
    uint64_t value)
{
    struct nvmf_completion_status status;

    nvmf_status_init(&status);
    if (!nvmf_cmd_set_property(sc, offset, size, value, nvmf_complete, &status,
        M_WAITOK))
        return (ECONNABORTED);
    nvmf_wait_for_reply(&status);

    if (status.cqe.status != 0) {
        device_printf(sc->dev, "PROPERTY_SET failed, status %#x\n",
            le16toh(status.cqe.status));
        return (EIO);
    }
    return (0);
}

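/* Request a normal controller shutdown by setting CC.SHN. */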
static void
nvmf_shutdown_controller(struct nvmf_softc *sc)
{
    uint64_t cc;
    int error;

    error = nvmf_read_property(sc, NVMF_PROP_CC, 4, &cc);
    if (error != 0) {
        device_printf(sc->dev, "Failed to fetch CC for shutdown\n");
        return;
    }

    cc |= NVMEF(NVME_CC_REG_SHN, NVME_SHN_NORMAL);

    error = nvmf_write_property(sc, NVMF_PROP_CC, 4, cc);
    if (error != 0)
        device_printf(sc->dev,
            "Failed to set CC to trigger shutdown\n");
}

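/*
 * KeepAlive receive timer: if no traffic was received from the
 * controller during the last interval, treat the association as dead
 * and schedule a disconnect.
 */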
static void
nvmf_check_keep_alive(void *arg)
{
    struct nvmf_softc *sc = arg;
    int traffic;

    traffic = atomic_readandclear_int(&sc->ka_active_rx_traffic);
    if (traffic == 0) {
        device_printf(sc->dev,
            "disconnecting due to KeepAlive timeout\n");
        nvmf_disconnect(sc);
        return;
    }

    callout_schedule_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0, C_HARDCLOCK);
}

static void
nvmf_keep_alive_complete(void *arg, const struct nvme_completion *cqe)
{
    struct nvmf_softc *sc = arg;

    atomic_store_int(&sc->ka_active_rx_traffic, 1);
    if (cqe->status != 0) {
        device_printf(sc->dev,
            "KeepAlive response reported status %#x\n",
            le16toh(cqe->status));
    }
}

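/*
 * KeepAlive transmit timer: send a KeepAlive command each interval
 * unless TKAS is active and other commands were sent recently.
 */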
static void
nvmf_send_keep_alive(void *arg)
{
    struct nvmf_softc *sc = arg;
    int traffic;

    /*
     * Don't bother sending a KeepAlive command if TKAS is active
     * and another command has been sent during the interval.
     */
    traffic = atomic_load_int(&sc->ka_active_tx_traffic);
    if (traffic == 0 && !nvmf_cmd_keep_alive(sc, nvmf_keep_alive_complete,
        sc, M_NOWAIT))
        device_printf(sc->dev,
            "Failed to allocate KeepAlive command\n");

    /* Clear ka_active_tx_traffic after sending the keep alive command. */
    atomic_store_int(&sc->ka_active_tx_traffic, 0);

    callout_schedule_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, C_HARDCLOCK);
}

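/*
 * Copy in and validate the handoff state (controller data and queue
 * parameters) provided by userland when creating or reconnecting a
 * host association.
 */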
int
nvmf_init_ivars(struct nvmf_ivars *ivars, struct nvmf_handoff_host *hh)
{
    size_t len;
    u_int i;
    int error;

    memset(ivars, 0, sizeof(*ivars));

    if (!hh->admin.admin || hh->num_io_queues < 1)
        return (EINVAL);

    ivars->cdata = malloc(sizeof(*ivars->cdata), M_NVMF, M_WAITOK);
    error = copyin(hh->cdata, ivars->cdata, sizeof(*ivars->cdata));
    if (error != 0)
        goto out;
    nvme_controller_data_swapbytes(ivars->cdata);

    len = hh->num_io_queues * sizeof(*ivars->io_params);
    ivars->io_params = malloc(len, M_NVMF, M_WAITOK);
    error = copyin(hh->io, ivars->io_params, len);
    if (error != 0)
        goto out;
    for (i = 0; i < hh->num_io_queues; i++) {
        if (ivars->io_params[i].admin) {
            error = EINVAL;
            goto out;
        }

        /* Require all I/O queues to be the same size. */
        if (ivars->io_params[i].qsize != ivars->io_params[0].qsize) {
            error = EINVAL;
            goto out;
        }
    }

    ivars->hh = hh;
    return (0);

out:
    free(ivars->io_params, M_NVMF);
    free(ivars->cdata, M_NVMF);
    return (error);
}

void
nvmf_free_ivars(struct nvmf_ivars *ivars)
{
    free(ivars->io_params, M_NVMF);
    free(ivars->cdata, M_NVMF);
}

static int
nvmf_probe(device_t dev)
{
    struct nvmf_ivars *ivars = device_get_ivars(dev);

    if (ivars == NULL)
        return (ENXIO);

    device_set_descf(dev, "Fabrics: %.256s", ivars->cdata->subnqn);
    return (BUS_PROBE_DEFAULT);
}

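/*
 * Create the admin and I/O queue pairs from the handed-off queue state
 * and start the KeepAlive timers if a KATO was negotiated.
 */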
static int
nvmf_establish_connection(struct nvmf_softc *sc, struct nvmf_ivars *ivars)
{
    char name[16];

    /* Setup the admin queue. */
    sc->admin = nvmf_init_qp(sc, ivars->hh->trtype, &ivars->hh->admin,
        "admin queue", 0);
    if (sc->admin == NULL) {
        device_printf(sc->dev, "Failed to setup admin queue\n");
        return (ENXIO);
    }

    /* Setup I/O queues. */
    sc->io = malloc(ivars->hh->num_io_queues * sizeof(*sc->io), M_NVMF,
        M_WAITOK | M_ZERO);
    sc->num_io_queues = ivars->hh->num_io_queues;
    for (u_int i = 0; i < sc->num_io_queues; i++) {
        snprintf(name, sizeof(name), "I/O queue %u", i);
        sc->io[i] = nvmf_init_qp(sc, ivars->hh->trtype,
            &ivars->io_params[i], name, i);
        if (sc->io[i] == NULL) {
            device_printf(sc->dev, "Failed to setup I/O queue %u\n",
                i + 1);
            return (ENXIO);
        }
    }

    /* Start KeepAlive timers. */
    if (ivars->hh->kato != 0) {
        sc->ka_traffic = NVMEV(NVME_CTRLR_DATA_CTRATT_TBKAS,
            sc->cdata->ctratt) != 0;
        sc->ka_rx_sbt = mstosbt(ivars->hh->kato);
        sc->ka_tx_sbt = sc->ka_rx_sbt / 2;
        callout_reset_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0,
            nvmf_check_keep_alive, sc, C_HARDCLOCK);
        callout_reset_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0,
            nvmf_send_keep_alive, sc, C_HARDCLOCK);
    }

    return (0);
}

typedef bool nvmf_scan_active_ns_cb(struct nvmf_softc *, uint32_t,
    const struct nvme_namespace_data *, void *);

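/*
 * Fetch one page of the active namespace list starting at *nsidp and
 * invoke the callback for each namespace in it.  On success, *nsidp is
 * updated to the NSID to resume the scan from, or 0 if the scan is
 * complete.
 */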
static bool
nvmf_scan_active_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist,
    struct nvme_namespace_data *data, uint32_t *nsidp,
    nvmf_scan_active_ns_cb *cb, void *cb_arg)
{
    struct nvmf_completion_status status;
    uint32_t nsid;

    nvmf_status_init(&status);
    nvmf_status_wait_io(&status);
    if (!nvmf_cmd_identify_active_namespaces(sc, *nsidp, nslist,
        nvmf_complete, &status, nvmf_io_complete, &status, M_WAITOK)) {
        device_printf(sc->dev,
            "failed to send IDENTIFY active namespaces command\n");
        return (false);
    }
    nvmf_wait_for_reply(&status);

    if (status.cqe.status != 0) {
        device_printf(sc->dev,
            "IDENTIFY active namespaces failed, status %#x\n",
            le16toh(status.cqe.status));
        return (false);
    }

    if (status.io_error != 0) {
        device_printf(sc->dev,
            "IDENTIFY active namespaces failed with I/O error %d\n",
            status.io_error);
        return (false);
    }

    for (u_int i = 0; i < nitems(nslist->ns); i++) {
        nsid = nslist->ns[i];
        if (nsid == 0) {
            *nsidp = 0;
            return (true);
        }

        nvmf_status_init(&status);
        nvmf_status_wait_io(&status);
        if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
            &status, nvmf_io_complete, &status, M_WAITOK)) {
            device_printf(sc->dev,
                "failed to send IDENTIFY namespace %u command\n",
                nsid);
            return (false);
        }
        nvmf_wait_for_reply(&status);

        if (status.cqe.status != 0) {
            device_printf(sc->dev,
                "IDENTIFY namespace %u failed, status %#x\n", nsid,
                le16toh(status.cqe.status));
            return (false);
        }

        if (status.io_error != 0) {
            device_printf(sc->dev,
                "IDENTIFY namespace %u failed with I/O error %d\n",
                nsid, status.io_error);
            return (false);
        }

        nvme_namespace_data_swapbytes(data);
        if (!cb(sc, nsid, data, cb_arg))
            return (false);
    }

    MPASS(nsid == nslist->ns[nitems(nslist->ns) - 1] && nsid != 0);

    if (nsid >= NVME_GLOBAL_NAMESPACE_TAG - 1)
        *nsidp = 0;
    else
        *nsidp = nsid;
    return (true);
}

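/*
 * Walk the controller's entire active namespace list, invoking the
 * callback for each active namespace.
 */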
static bool
nvmf_scan_active_namespaces(struct nvmf_softc *sc, nvmf_scan_active_ns_cb *cb,
    void *cb_arg)
{
    struct nvme_namespace_data *data;
    struct nvme_ns_list *nslist;
    uint32_t nsid;
    bool retval;

    nslist = malloc(sizeof(*nslist), M_NVMF, M_WAITOK);
    data = malloc(sizeof(*data), M_NVMF, M_WAITOK);

    nsid = 0;
    retval = true;
    for (;;) {
        if (!nvmf_scan_active_nslist(sc, nslist, data, &nsid, cb,
            cb_arg)) {
            retval = false;
            break;
        }
        if (nsid == 0)
            break;
    }

    free(data, M_NVMF);
    free(nslist, M_NVMF);
    return (retval);
}

static bool
nvmf_add_ns(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data, void *arg __unused)
{
    if (sc->ns[nsid - 1] != NULL) {
        device_printf(sc->dev,
            "duplicate namespace %u in active namespace list\n",
            nsid);
        return (false);
    }

    /*
     * As in nvme_ns_construct, a size of zero indicates an
     * invalid namespace.
     */
    if (data->nsze == 0) {
        device_printf(sc->dev,
            "ignoring active namespace %u with zero size\n", nsid);
        return (true);
    }

    sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);

    nvmf_sim_rescan_ns(sc, nsid);
    return (true);
}

static bool
nvmf_add_namespaces(struct nvmf_softc *sc)
{
    sc->ns = mallocarray(sc->cdata->nn, sizeof(*sc->ns), M_NVMF,
        M_WAITOK | M_ZERO);
    return (nvmf_scan_active_namespaces(sc, nvmf_add_ns, NULL));
}

static int
nvmf_attach(device_t dev)
{
    struct make_dev_args mda;
    struct nvmf_softc *sc = device_get_softc(dev);
    struct nvmf_ivars *ivars = device_get_ivars(dev);
    struct sysctl_oid *oid;
    uint64_t val;
    u_int i;
    int error;

    if (ivars == NULL)
        return (ENXIO);

    sc->dev = dev;
    sc->trtype = ivars->hh->trtype;
    callout_init(&sc->ka_rx_timer, 1);
    callout_init(&sc->ka_tx_timer, 1);
    sx_init(&sc->connection_lock, "nvmf connection");
    TASK_INIT(&sc->disconnect_task, 0, nvmf_disconnect_task, sc);

    oid = SYSCTL_ADD_NODE(device_get_sysctl_ctx(dev),
        SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "ioq",
        CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "I/O Queues");
    sc->ioq_oid_list = SYSCTL_CHILDREN(oid);

    /* Claim the cdata pointer from ivars. */
    sc->cdata = ivars->cdata;
    ivars->cdata = NULL;

    nvmf_init_aer(sc);

    error = nvmf_establish_connection(sc, ivars);
    if (error != 0)
        goto out;

    error = nvmf_read_property(sc, NVMF_PROP_CAP, 8, &sc->cap);
    if (error != 0) {
        device_printf(sc->dev, "Failed to fetch CAP\n");
        error = ENXIO;
        goto out;
    }

    error = nvmf_read_property(sc, NVMF_PROP_VS, 4, &val);
    if (error != 0) {
        device_printf(sc->dev, "Failed to fetch VS\n");
        error = ENXIO;
        goto out;
    }
    sc->vs = val;

    /* Honor MDTS if it is set. */
    sc->max_xfer_size = maxphys;
    if (sc->cdata->mdts != 0) {
        sc->max_xfer_size = ulmin(sc->max_xfer_size,
            1 << (sc->cdata->mdts + NVME_MPS_SHIFT +
            NVME_CAP_HI_MPSMIN(sc->cap >> 32)));
    }

    sc->max_pending_io = ivars->io_params[0].qsize * sc->num_io_queues;

    error = nvmf_init_sim(sc);
    if (error != 0)
        goto out;

    error = nvmf_start_aer(sc);
    if (error != 0) {
        nvmf_destroy_sim(sc);
        goto out;
    }

    if (!nvmf_add_namespaces(sc)) {
        nvmf_destroy_sim(sc);
        /* Don't report a successful attach after tearing down. */
        error = ENXIO;
        goto out;
    }

    make_dev_args_init(&mda);
    mda.mda_devsw = &nvmf_cdevsw;
    mda.mda_uid = UID_ROOT;
    mda.mda_gid = GID_WHEEL;
    mda.mda_mode = 0600;
    mda.mda_si_drv1 = sc;
    error = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev));
    if (error != 0) {
        nvmf_destroy_sim(sc);
        goto out;
    }

    sc->shutdown_pre_sync_eh = EVENTHANDLER_REGISTER(shutdown_pre_sync,
        nvmf_shutdown_pre_sync, sc, SHUTDOWN_PRI_FIRST);
    sc->shutdown_post_sync_eh = EVENTHANDLER_REGISTER(shutdown_post_sync,
        nvmf_shutdown_post_sync, sc, SHUTDOWN_PRI_FIRST);

    return (0);
out:
    if (sc->ns != NULL) {
        for (i = 0; i < sc->cdata->nn; i++) {
            if (sc->ns[i] != NULL)
                nvmf_destroy_ns(sc->ns[i]);
        }
        free(sc->ns, M_NVMF);
    }

    callout_drain(&sc->ka_tx_timer);
    callout_drain(&sc->ka_rx_timer);

    if (sc->admin != NULL)
        nvmf_shutdown_controller(sc);

    for (i = 0; i < sc->num_io_queues; i++) {
        if (sc->io[i] != NULL)
            nvmf_destroy_qp(sc->io[i]);
    }
    free(sc->io, M_NVMF);
    if (sc->admin != NULL)
        nvmf_destroy_qp(sc->admin);

    nvmf_destroy_aer(sc);

    taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
    sx_destroy(&sc->connection_lock);
    free(sc->cdata, M_NVMF);
    return (error);
}

void
nvmf_disconnect(struct nvmf_softc *sc)
{
    taskqueue_enqueue(taskqueue_thread, &sc->disconnect_task);
}

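/*
 * Tear down the current association after a transport error or
 * KeepAlive timeout: quiesce namespace consumers and destroy all of
 * the queue pairs.  A later NVMF_RECONNECT_HOST ioctl can establish a
 * new association.
 */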
static void
nvmf_disconnect_task(void *arg, int pending __unused)
{
    struct nvmf_softc *sc = arg;
    u_int i;

    sx_xlock(&sc->connection_lock);
    if (sc->admin == NULL) {
        /*
         * Ignore transport errors if there is no active
         * association.
         */
        sx_xunlock(&sc->connection_lock);
        return;
    }

    if (sc->detaching) {
        if (sc->admin != NULL) {
            /*
             * This unsticks the detach process if a
             * transport error occurs during detach.
             */
            nvmf_shutdown_qp(sc->admin);
        }
        sx_xunlock(&sc->connection_lock);
        return;
    }

    if (sc->cdev == NULL) {
        /*
         * Transport error occurred during attach
         * (nvmf_add_namespaces).  Shutdown the admin queue.
         */
        nvmf_shutdown_qp(sc->admin);
        sx_xunlock(&sc->connection_lock);
        return;
    }

    callout_drain(&sc->ka_tx_timer);
    callout_drain(&sc->ka_rx_timer);
    sc->ka_traffic = false;

    /* Quiesce namespace consumers. */
    nvmf_disconnect_sim(sc);
    for (i = 0; i < sc->cdata->nn; i++) {
        if (sc->ns[i] != NULL)
            nvmf_disconnect_ns(sc->ns[i]);
    }

    /* Shutdown the existing qpairs. */
    for (i = 0; i < sc->num_io_queues; i++) {
        nvmf_destroy_qp(sc->io[i]);
    }
    free(sc->io, M_NVMF);
    sc->io = NULL;
    sc->num_io_queues = 0;
    nvmf_destroy_qp(sc->admin);
    sc->admin = NULL;

    sx_xunlock(&sc->connection_lock);
}

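/*
 * Establish a new association using the queue state handed off via the
 * NVMF_RECONNECT_HOST ioctl.  The new association must connect to the
 * same NVMe subsystem as the original one.
 */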
static int
nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_handoff_host *hh)
{
    struct nvmf_ivars ivars;
    u_int i;
    int error;

    /* XXX: Should we permit changing the transport type? */
    if (sc->trtype != hh->trtype) {
        device_printf(sc->dev,
            "transport type mismatch on reconnect\n");
        return (EINVAL);
    }

    error = nvmf_init_ivars(&ivars, hh);
    if (error != 0)
        return (error);

    sx_xlock(&sc->connection_lock);
    if (sc->admin != NULL || sc->detaching) {
        error = EBUSY;
        goto out;
    }

    /*
     * Ensure this is for the same controller.  Note that the
     * controller ID can vary across associations if the remote
     * system is using the dynamic controller model.  This merely
     * ensures the new association is connected to the same NVMe
     * subsystem.
     */
    if (memcmp(sc->cdata->subnqn, ivars.cdata->subnqn,
        sizeof(ivars.cdata->subnqn)) != 0) {
        device_printf(sc->dev,
            "controller subsystem NQN mismatch on reconnect\n");
        error = EINVAL;
        goto out;
    }

    /*
     * XXX: Require same number and size of I/O queues so that
     * max_pending_io is still correct?
     */

    error = nvmf_establish_connection(sc, &ivars);
    if (error != 0)
        goto out;

    error = nvmf_start_aer(sc);
    if (error != 0)
        goto out;

    device_printf(sc->dev,
        "established new association with %u I/O queues\n",
        sc->num_io_queues);

    /* Restart namespace consumers. */
    for (i = 0; i < sc->cdata->nn; i++) {
        if (sc->ns[i] != NULL)
            nvmf_reconnect_ns(sc->ns[i]);
    }
    nvmf_reconnect_sim(sc);

    nvmf_rescan_all_ns(sc);
out:
    sx_xunlock(&sc->connection_lock);
    nvmf_free_ivars(&ivars);
    return (error);
}

static void
nvmf_shutdown_pre_sync(void *arg, int howto)
{
    struct nvmf_softc *sc = arg;

    if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED())
        return;

    /*
     * If this association is disconnected, abort any pending
     * requests with an error to permit filesystems to unmount
     * without hanging.
     */
    sx_xlock(&sc->connection_lock);
    if (sc->admin != NULL || sc->detaching) {
        sx_xunlock(&sc->connection_lock);
        return;
    }

    for (u_int i = 0; i < sc->cdata->nn; i++) {
        if (sc->ns[i] != NULL)
            nvmf_shutdown_ns(sc->ns[i]);
    }
    nvmf_shutdown_sim(sc);
    sx_xunlock(&sc->connection_lock);
}

static void
nvmf_shutdown_post_sync(void *arg, int howto)
{
    struct nvmf_softc *sc = arg;

    if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED())
        return;

    /*
     * If this association is connected, disconnect gracefully.
     */
    sx_xlock(&sc->connection_lock);
    if (sc->admin == NULL || sc->detaching) {
        sx_xunlock(&sc->connection_lock);
        return;
    }

    callout_drain(&sc->ka_tx_timer);
    callout_drain(&sc->ka_rx_timer);

    nvmf_shutdown_controller(sc);
    for (u_int i = 0; i < sc->num_io_queues; i++) {
        nvmf_destroy_qp(sc->io[i]);
    }
    nvmf_destroy_qp(sc->admin);
    sc->admin = NULL;
    sx_xunlock(&sc->connection_lock);
}

static int
nvmf_detach(device_t dev)
{
    struct nvmf_softc *sc = device_get_softc(dev);
    u_int i;

    destroy_dev(sc->cdev);

    sx_xlock(&sc->connection_lock);
    sc->detaching = true;
    sx_xunlock(&sc->connection_lock);

    EVENTHANDLER_DEREGISTER(shutdown_pre_sync, sc->shutdown_pre_sync_eh);
    EVENTHANDLER_DEREGISTER(shutdown_post_sync, sc->shutdown_post_sync_eh);

    nvmf_destroy_sim(sc);
    for (i = 0; i < sc->cdata->nn; i++) {
        if (sc->ns[i] != NULL)
            nvmf_destroy_ns(sc->ns[i]);
    }
    free(sc->ns, M_NVMF);

    callout_drain(&sc->ka_tx_timer);
    callout_drain(&sc->ka_rx_timer);

    if (sc->admin != NULL)
        nvmf_shutdown_controller(sc);

    for (i = 0; i < sc->num_io_queues; i++) {
        nvmf_destroy_qp(sc->io[i]);
    }
    free(sc->io, M_NVMF);

    taskqueue_drain(taskqueue_thread, &sc->disconnect_task);

    if (sc->admin != NULL)
        nvmf_destroy_qp(sc->admin);

    nvmf_destroy_aer(sc);

    sx_destroy(&sc->connection_lock);
    free(sc->cdata, M_NVMF);
    return (0);
}

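/*
 * Update the state of a single namespace from fresh IDENTIFY data:
 * create, update, or destroy the namespace as needed and request a
 * CAM rescan.
 */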
static void
nvmf_rescan_ns_1(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data)
{
    struct nvmf_namespace *ns;

    /* XXX: Needs locking around sc->ns[]. */
    ns = sc->ns[nsid - 1];
    if (data->nsze == 0) {
        /* XXX: Needs locking */
        if (ns != NULL) {
            nvmf_destroy_ns(ns);
            sc->ns[nsid - 1] = NULL;
        }
    } else {
        /* XXX: Needs locking */
        if (ns == NULL) {
            sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
        } else {
            if (!nvmf_update_ns(ns, data)) {
                nvmf_destroy_ns(ns);
                sc->ns[nsid - 1] = NULL;
            }
        }
    }

    nvmf_sim_rescan_ns(sc, nsid);
}

void
nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid)
{
    struct nvmf_completion_status status;
    struct nvme_namespace_data *data;

    data = malloc(sizeof(*data), M_NVMF, M_WAITOK);

    nvmf_status_init(&status);
    nvmf_status_wait_io(&status);
    if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
        &status, nvmf_io_complete, &status, M_WAITOK)) {
        device_printf(sc->dev,
            "failed to send IDENTIFY namespace %u command\n", nsid);
        free(data, M_NVMF);
        return;
    }
    nvmf_wait_for_reply(&status);

    if (status.cqe.status != 0) {
        device_printf(sc->dev,
            "IDENTIFY namespace %u failed, status %#x\n", nsid,
            le16toh(status.cqe.status));
        free(data, M_NVMF);
        return;
    }

    if (status.io_error != 0) {
        device_printf(sc->dev,
            "IDENTIFY namespace %u failed with I/O error %d\n",
            nsid, status.io_error);
        free(data, M_NVMF);
        return;
    }

    nvme_namespace_data_swapbytes(data);

    nvmf_rescan_ns_1(sc, nsid, data);

    free(data, M_NVMF);
}

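/*
 * Destroy any namespaces in the range [first_nsid, next_valid_nsid)
 * that are no longer present in the active namespace list.
 */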
static void
nvmf_purge_namespaces(struct nvmf_softc *sc, uint32_t first_nsid,
    uint32_t next_valid_nsid)
{
    struct nvmf_namespace *ns;

    for (uint32_t nsid = first_nsid; nsid < next_valid_nsid; nsid++) {
        /* XXX: Needs locking around sc->ns[]. */
        ns = sc->ns[nsid - 1];
        if (ns != NULL) {
            nvmf_destroy_ns(ns);
            sc->ns[nsid - 1] = NULL;

            nvmf_sim_rescan_ns(sc, nsid);
        }
    }
}

static bool
nvmf_rescan_ns_cb(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data, void *arg)
{
    uint32_t *last_nsid = arg;

    /* Check for any gaps prior to this namespace. */
    nvmf_purge_namespaces(sc, *last_nsid + 1, nsid);
    *last_nsid = nsid;

    nvmf_rescan_ns_1(sc, nsid, data);
    return (true);
}

void
nvmf_rescan_all_ns(struct nvmf_softc *sc)
{
    uint32_t last_nsid;

    last_nsid = 0;
    if (!nvmf_scan_active_namespaces(sc, nvmf_rescan_ns_cb, &last_nsid))
        return;

    /*
     * Check for any namespace devices after the last active
     * namespace.
     */
    nvmf_purge_namespaces(sc, last_nsid + 1, sc->cdata->nn + 1);
}

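/*
 * Execute a passthrough command submitted via an ioctl, bouncing any
 * user data through a temporary kernel buffer.
 */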
int
nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt,
    bool admin)
{
    struct nvmf_completion_status status;
    struct nvme_command cmd;
    struct memdesc mem;
    struct nvmf_host_qpair *qp;
    struct nvmf_request *req;
    void *buf;
    int error;

    if (pt->len > sc->max_xfer_size)
        return (EINVAL);

    buf = NULL;
    if (pt->len != 0) {
        /*
         * XXX: Depending on the size we may want to pin the
         * user pages and use a memdesc with vm_page_t's
         * instead.
         */
        buf = malloc(pt->len, M_NVMF, M_WAITOK);
        if (pt->is_read == 0) {
            error = copyin(pt->buf, buf, pt->len);
            if (error != 0) {
                free(buf, M_NVMF);
                return (error);
            }
        } else {
            /* Ensure no kernel data is leaked to userland. */
            memset(buf, 0, pt->len);
        }
    }

    memset(&cmd, 0, sizeof(cmd));
    cmd.opc = pt->cmd.opc;
    cmd.fuse = pt->cmd.fuse;
    cmd.nsid = pt->cmd.nsid;
    cmd.cdw10 = pt->cmd.cdw10;
    cmd.cdw11 = pt->cmd.cdw11;
    cmd.cdw12 = pt->cmd.cdw12;
    cmd.cdw13 = pt->cmd.cdw13;
    cmd.cdw14 = pt->cmd.cdw14;
    cmd.cdw15 = pt->cmd.cdw15;

    sx_slock(&sc->connection_lock);
    if (sc->admin == NULL || sc->detaching) {
        device_printf(sc->dev,
            "failed to send passthrough command\n");
        error = ECONNABORTED;
        sx_sunlock(&sc->connection_lock);
        goto error;
    }
    if (admin)
        qp = sc->admin;
    else
        qp = nvmf_select_io_queue(sc);
    nvmf_status_init(&status);
    req = nvmf_allocate_request(qp, &cmd, nvmf_complete, &status, M_WAITOK);
    sx_sunlock(&sc->connection_lock);
    if (req == NULL) {
        device_printf(sc->dev, "failed to send passthrough command\n");
        error = ECONNABORTED;
        goto error;
    }

    if (pt->len != 0) {
        mem = memdesc_vaddr(buf, pt->len);
        nvmf_capsule_append_data(req->nc, &mem, pt->len,
            pt->is_read == 0, nvmf_io_complete, &status);
        nvmf_status_wait_io(&status);
    }

    nvmf_submit_request(req);
    nvmf_wait_for_reply(&status);

    memset(&pt->cpl, 0, sizeof(pt->cpl));
    pt->cpl.cdw0 = status.cqe.cdw0;
    pt->cpl.status = status.cqe.status;

    error = status.io_error;
    if (error == 0 && pt->len != 0 && pt->is_read != 0)
        error = copyout(buf, pt->buf, pt->len);
error:
    free(buf, M_NVMF);
    return (error);
}

static int
nvmf_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
    struct thread *td)
{
    struct nvmf_softc *sc = cdev->si_drv1;
    struct nvme_get_nsid *gnsid;
    struct nvme_pt_command *pt;
    struct nvmf_reconnect_params *rp;
    struct nvmf_handoff_host *hh;

    switch (cmd) {
    case NVME_PASSTHROUGH_CMD:
        pt = (struct nvme_pt_command *)arg;
        return (nvmf_passthrough_cmd(sc, pt, true));
    case NVME_GET_NSID:
        gnsid = (struct nvme_get_nsid *)arg;
        strlcpy(gnsid->cdev, device_get_nameunit(sc->dev),
            sizeof(gnsid->cdev));
        gnsid->nsid = 0;
        return (0);
    case NVME_GET_MAX_XFER_SIZE:
        *(uint64_t *)arg = sc->max_xfer_size;
        return (0);
    case NVMF_RECONNECT_PARAMS:
        rp = (struct nvmf_reconnect_params *)arg;
        if ((sc->cdata->fcatt & 1) == 0)
            rp->cntlid = NVMF_CNTLID_DYNAMIC;
        else
            rp->cntlid = sc->cdata->ctrlr_id;
        memcpy(rp->subnqn, sc->cdata->subnqn, sizeof(rp->subnqn));
        return (0);
    case NVMF_RECONNECT_HOST:
        hh = (struct nvmf_handoff_host *)arg;
        return (nvmf_reconnect_host(sc, hh));
    default:
        return (ENOTTY);
    }
}

static struct cdevsw nvmf_cdevsw = {
    .d_version = D_VERSION,
    .d_ioctl = nvmf_ioctl
};

static int
nvmf_modevent(module_t mod, int what, void *arg)
{
    switch (what) {
    case MOD_LOAD:
        return (nvmf_ctl_load());
    case MOD_QUIESCE:
        return (0);
    case MOD_UNLOAD:
        nvmf_ctl_unload();
        destroy_dev_drain(&nvmf_cdevsw);
        return (0);
    default:
        return (EOPNOTSUPP);
    }
}

static device_method_t nvmf_methods[] = {
    /* Device interface */
    DEVMETHOD(device_probe, nvmf_probe),
    DEVMETHOD(device_attach, nvmf_attach),
    DEVMETHOD(device_detach, nvmf_detach),
    DEVMETHOD_END
};

driver_t nvme_nvmf_driver = {
    "nvme",
    nvmf_methods,
    sizeof(struct nvmf_softc),
};

DRIVER_MODULE(nvme, root, nvme_nvmf_driver, nvmf_modevent, NULL);
MODULE_DEPEND(nvmf, nvmf_transport, 1, 1, 1);