xref: /freebsd/sys/dev/nvmf/host/nvmf.c (revision 6751f65e6af15348abdc6106cf54c8335d45e49b)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2023-2024 Chelsio Communications, Inc.
5  * Written by: John Baldwin <jhb@FreeBSD.org>
6  */
7 
8 #include <sys/param.h>
9 #include <sys/bus.h>
10 #include <sys/conf.h>
11 #include <sys/dnv.h>
12 #include <sys/eventhandler.h>
13 #include <sys/lock.h>
14 #include <sys/kernel.h>
15 #include <sys/malloc.h>
16 #include <sys/memdesc.h>
17 #include <sys/module.h>
18 #include <sys/mutex.h>
19 #include <sys/nv.h>
20 #include <sys/reboot.h>
21 #include <sys/sx.h>
22 #include <sys/sysctl.h>
23 #include <sys/taskqueue.h>
24 #include <dev/nvme/nvme.h>
25 #include <dev/nvmf/nvmf.h>
26 #include <dev/nvmf/nvmf_transport.h>
27 #include <dev/nvmf/host/nvmf_var.h>
28 
/* Character device switch for the per-controller device; defined below. */
static struct cdevsw nvmf_cdevsw;

/*
 * Tunable exposed as the fail_on_disconnection sysctl under _kern_nvmf;
 * defaults to false.
 */
bool nvmf_fail_disconnect = false;
SYSCTL_BOOL(_kern_nvmf, OID_AUTO, fail_on_disconnection, CTLFLAG_RWTUN,
    &nvmf_fail_disconnect, 0, "Fail I/O requests on connection failure");

/* malloc(9) type used for every allocation made in this file. */
MALLOC_DEFINE(M_NVMF, "nvmf", "NVMe over Fabrics host");

/* Forward declarations for handlers referenced before their definitions. */
static void	nvmf_disconnect_task(void *arg, int pending);
static void	nvmf_shutdown_pre_sync(void *arg, int howto);
static void	nvmf_shutdown_post_sync(void *arg, int howto);
40 
41 void
42 nvmf_complete(void *arg, const struct nvme_completion *cqe)
43 {
44 	struct nvmf_completion_status *status = arg;
45 	struct mtx *mtx;
46 
47 	status->cqe = *cqe;
48 	mtx = mtx_pool_find(mtxpool_sleep, status);
49 	mtx_lock(mtx);
50 	status->done = true;
51 	mtx_unlock(mtx);
52 	wakeup(status);
53 }
54 
55 void
56 nvmf_io_complete(void *arg, size_t xfered, int error)
57 {
58 	struct nvmf_completion_status *status = arg;
59 	struct mtx *mtx;
60 
61 	status->io_error = error;
62 	mtx = mtx_pool_find(mtxpool_sleep, status);
63 	mtx_lock(mtx);
64 	status->io_done = true;
65 	mtx_unlock(mtx);
66 	wakeup(status);
67 }
68 
69 void
70 nvmf_wait_for_reply(struct nvmf_completion_status *status)
71 {
72 	struct mtx *mtx;
73 
74 	mtx = mtx_pool_find(mtxpool_sleep, status);
75 	mtx_lock(mtx);
76 	while (!status->done || !status->io_done)
77 		mtx_sleep(status, mtx, 0, "nvmfcmd", 0);
78 	mtx_unlock(mtx);
79 }
80 
81 static int
82 nvmf_read_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
83     uint64_t *value)
84 {
85 	const struct nvmf_fabric_prop_get_rsp *rsp;
86 	struct nvmf_completion_status status;
87 
88 	nvmf_status_init(&status);
89 	if (!nvmf_cmd_get_property(sc, offset, size, nvmf_complete, &status,
90 	    M_WAITOK))
91 		return (ECONNABORTED);
92 	nvmf_wait_for_reply(&status);
93 
94 	if (status.cqe.status != 0) {
95 		device_printf(sc->dev, "PROPERTY_GET failed, status %#x\n",
96 		    le16toh(status.cqe.status));
97 		return (EIO);
98 	}
99 
100 	rsp = (const struct nvmf_fabric_prop_get_rsp *)&status.cqe;
101 	if (size == 8)
102 		*value = le64toh(rsp->value.u64);
103 	else
104 		*value = le32toh(rsp->value.u32.low);
105 	return (0);
106 }
107 
108 static int
109 nvmf_write_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
110     uint64_t value)
111 {
112 	struct nvmf_completion_status status;
113 
114 	nvmf_status_init(&status);
115 	if (!nvmf_cmd_set_property(sc, offset, size, value, nvmf_complete, &status,
116 	    M_WAITOK))
117 		return (ECONNABORTED);
118 	nvmf_wait_for_reply(&status);
119 
120 	if (status.cqe.status != 0) {
121 		device_printf(sc->dev, "PROPERTY_SET failed, status %#x\n",
122 		    le16toh(status.cqe.status));
123 		return (EIO);
124 	}
125 	return (0);
126 }
127 
128 static void
129 nvmf_shutdown_controller(struct nvmf_softc *sc)
130 {
131 	uint64_t cc;
132 	int error;
133 
134 	error = nvmf_read_property(sc, NVMF_PROP_CC, 4, &cc);
135 	if (error != 0) {
136 		device_printf(sc->dev, "Failed to fetch CC for shutdown\n");
137 		return;
138 	}
139 
140 	cc |= NVMEF(NVME_CC_REG_SHN, NVME_SHN_NORMAL);
141 
142 	error = nvmf_write_property(sc, NVMF_PROP_CC, 4, cc);
143 	if (error != 0)
144 		device_printf(sc->dev,
145 		    "Failed to set CC to trigger shutdown\n");
146 }
147 
148 static void
149 nvmf_check_keep_alive(void *arg)
150 {
151 	struct nvmf_softc *sc = arg;
152 	int traffic;
153 
154 	traffic = atomic_readandclear_int(&sc->ka_active_rx_traffic);
155 	if (traffic == 0) {
156 		device_printf(sc->dev,
157 		    "disconnecting due to KeepAlive timeout\n");
158 		nvmf_disconnect(sc);
159 		return;
160 	}
161 
162 	callout_schedule_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0, C_HARDCLOCK);
163 }
164 
165 static void
166 nvmf_keep_alive_complete(void *arg, const struct nvme_completion *cqe)
167 {
168 	struct nvmf_softc *sc = arg;
169 
170 	atomic_store_int(&sc->ka_active_rx_traffic, 1);
171 	if (cqe->status != 0) {
172 		device_printf(sc->dev,
173 		    "KeepAlive response reported status %#x\n",
174 		    le16toh(cqe->status));
175 	}
176 }
177 
178 static void
179 nvmf_send_keep_alive(void *arg)
180 {
181 	struct nvmf_softc *sc = arg;
182 	int traffic;
183 
184 	/*
185 	 * Don't bother sending a KeepAlive command if TKAS is active
186 	 * and another command has been sent during the interval.
187 	 */
188 	traffic = atomic_load_int(&sc->ka_active_tx_traffic);
189 	if (traffic == 0 && !nvmf_cmd_keep_alive(sc, nvmf_keep_alive_complete,
190 	    sc, M_NOWAIT))
191 		device_printf(sc->dev,
192 		    "Failed to allocate KeepAlive command\n");
193 
194 	/* Clear ka_active_tx_traffic after sending the keep alive command. */
195 	atomic_store_int(&sc->ka_active_tx_traffic, 0);
196 
197 	callout_schedule_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, C_HARDCLOCK);
198 }
199 
200 int
201 nvmf_copyin_handoff(const struct nvmf_ioc_nv *nv, nvlist_t **nvlp)
202 {
203 	const nvlist_t *const *io;
204 	const nvlist_t *admin;
205 	nvlist_t *nvl;
206 	size_t i, num_io_queues;
207 	uint32_t qsize;
208 	int error;
209 
210 	error = nvmf_unpack_ioc_nvlist(nv, &nvl);
211 	if (error != 0)
212 		return (error);
213 
214 	if (!nvlist_exists_number(nvl, "trtype") ||
215 	    !nvlist_exists_nvlist(nvl, "admin") ||
216 	    !nvlist_exists_nvlist_array(nvl, "io") ||
217 	    !nvlist_exists_binary(nvl, "cdata"))
218 		goto invalid;
219 
220 	admin = nvlist_get_nvlist(nvl, "admin");
221 	if (!nvmf_validate_qpair_nvlist(admin, false))
222 		goto invalid;
223 	if (!nvlist_get_bool(admin, "admin"))
224 		goto invalid;
225 
226 	io = nvlist_get_nvlist_array(nvl, "io", &num_io_queues);
227 	if (num_io_queues < 1)
228 		goto invalid;
229 	for (i = 0; i < num_io_queues; i++) {
230 		if (!nvmf_validate_qpair_nvlist(io[i], false))
231 			goto invalid;
232 	}
233 
234 	/* Require all I/O queues to be the same size. */
235 	qsize = nvlist_get_number(io[0], "qsize");
236 	for (i = 1; i < num_io_queues; i++) {
237 		if (nvlist_get_number(io[i], "qsize") != qsize)
238 			goto invalid;
239 	}
240 
241 	nvlist_get_binary(nvl, "cdata", &i);
242 	if (i != sizeof(struct nvme_controller_data))
243 		goto invalid;
244 
245 	*nvlp = nvl;
246 	return (0);
247 invalid:
248 	nvlist_destroy(nvl);
249 	return (EINVAL);
250 }
251 
252 static int
253 nvmf_probe(device_t dev)
254 {
255 	const nvlist_t *nvl = device_get_ivars(dev);
256 	const struct nvme_controller_data *cdata;
257 
258 	if (nvl == NULL)
259 		return (ENXIO);
260 
261 	cdata = nvlist_get_binary(nvl, "cdata", NULL);
262 	device_set_descf(dev, "Fabrics: %.256s", cdata->subnqn);
263 	return (BUS_PROBE_DEFAULT);
264 }
265 
266 static int
267 nvmf_establish_connection(struct nvmf_softc *sc, const nvlist_t *nvl)
268 {
269 	const nvlist_t *const *io;
270 	const nvlist_t *admin;
271 	uint64_t kato;
272 	size_t num_io_queues;
273 	enum nvmf_trtype trtype;
274 	char name[16];
275 
276 	trtype = nvlist_get_number(nvl, "trtype");
277 	admin = nvlist_get_nvlist(nvl, "admin");
278 	io = nvlist_get_nvlist_array(nvl, "io", &num_io_queues);
279 	kato = dnvlist_get_number(nvl, "kato", 0);
280 
281 	/* Setup the admin queue. */
282 	sc->admin = nvmf_init_qp(sc, trtype, admin, "admin queue", 0);
283 	if (sc->admin == NULL) {
284 		device_printf(sc->dev, "Failed to setup admin queue\n");
285 		return (ENXIO);
286 	}
287 
288 	/* Setup I/O queues. */
289 	sc->io = malloc(num_io_queues * sizeof(*sc->io), M_NVMF,
290 	    M_WAITOK | M_ZERO);
291 	sc->num_io_queues = num_io_queues;
292 	for (u_int i = 0; i < sc->num_io_queues; i++) {
293 		snprintf(name, sizeof(name), "I/O queue %u", i);
294 		sc->io[i] = nvmf_init_qp(sc, trtype, io[i], name, i);
295 		if (sc->io[i] == NULL) {
296 			device_printf(sc->dev, "Failed to setup I/O queue %u\n",
297 			    i + 1);
298 			return (ENXIO);
299 		}
300 	}
301 
302 	/* Start KeepAlive timers. */
303 	if (kato != 0) {
304 		sc->ka_traffic = NVMEV(NVME_CTRLR_DATA_CTRATT_TBKAS,
305 		    sc->cdata->ctratt) != 0;
306 		sc->ka_rx_sbt = mstosbt(kato);
307 		sc->ka_tx_sbt = sc->ka_rx_sbt / 2;
308 		callout_reset_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0,
309 		    nvmf_check_keep_alive, sc, C_HARDCLOCK);
310 		callout_reset_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0,
311 		    nvmf_send_keep_alive, sc, C_HARDCLOCK);
312 	}
313 
314 	memcpy(sc->cdata, nvlist_get_binary(nvl, "cdata", NULL),
315 	    sizeof(*sc->cdata));
316 
317 	return (0);
318 }
319 
320 typedef bool nvmf_scan_active_ns_cb(struct nvmf_softc *, uint32_t,
321     const struct nvme_namespace_data *, void *);
322 
323 static bool
324 nvmf_scan_active_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist,
325     struct nvme_namespace_data *data, uint32_t *nsidp,
326     nvmf_scan_active_ns_cb *cb, void *cb_arg)
327 {
328 	struct nvmf_completion_status status;
329 	uint32_t nsid;
330 
331 	nvmf_status_init(&status);
332 	nvmf_status_wait_io(&status);
333 	if (!nvmf_cmd_identify_active_namespaces(sc, *nsidp, nslist,
334 	    nvmf_complete, &status, nvmf_io_complete, &status, M_WAITOK)) {
335 		device_printf(sc->dev,
336 		    "failed to send IDENTIFY active namespaces command\n");
337 		return (false);
338 	}
339 	nvmf_wait_for_reply(&status);
340 
341 	if (status.cqe.status != 0) {
342 		device_printf(sc->dev,
343 		    "IDENTIFY active namespaces failed, status %#x\n",
344 		    le16toh(status.cqe.status));
345 		return (false);
346 	}
347 
348 	if (status.io_error != 0) {
349 		device_printf(sc->dev,
350 		    "IDENTIFY active namespaces failed with I/O error %d\n",
351 		    status.io_error);
352 		return (false);
353 	}
354 
355 	for (u_int i = 0; i < nitems(nslist->ns); i++) {
356 		nsid = nslist->ns[i];
357 		if (nsid == 0) {
358 			*nsidp = 0;
359 			return (true);
360 		}
361 
362 		nvmf_status_init(&status);
363 		nvmf_status_wait_io(&status);
364 		if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
365 		    &status, nvmf_io_complete, &status, M_WAITOK)) {
366 			device_printf(sc->dev,
367 			    "failed to send IDENTIFY namespace %u command\n",
368 			    nsid);
369 			return (false);
370 		}
371 		nvmf_wait_for_reply(&status);
372 
373 		if (status.cqe.status != 0) {
374 			device_printf(sc->dev,
375 			    "IDENTIFY namespace %u failed, status %#x\n", nsid,
376 			    le16toh(status.cqe.status));
377 			return (false);
378 		}
379 
380 		if (status.io_error != 0) {
381 			device_printf(sc->dev,
382 			    "IDENTIFY namespace %u failed with I/O error %d\n",
383 			    nsid, status.io_error);
384 			return (false);
385 		}
386 
387 		nvme_namespace_data_swapbytes(data);
388 		if (!cb(sc, nsid, data, cb_arg))
389 			return (false);
390 	}
391 
392 	MPASS(nsid == nslist->ns[nitems(nslist->ns) - 1] && nsid != 0);
393 
394 	if (nsid >= NVME_GLOBAL_NAMESPACE_TAG - 1)
395 		*nsidp = 0;
396 	else
397 		*nsidp = nsid;
398 	return (true);
399 }
400 
401 static bool
402 nvmf_scan_active_namespaces(struct nvmf_softc *sc, nvmf_scan_active_ns_cb *cb,
403     void *cb_arg)
404 {
405 	struct nvme_namespace_data *data;
406 	struct nvme_ns_list *nslist;
407 	uint32_t nsid;
408 	bool retval;
409 
410 	nslist = malloc(sizeof(*nslist), M_NVMF, M_WAITOK);
411 	data = malloc(sizeof(*data), M_NVMF, M_WAITOK);
412 
413 	nsid = 0;
414 	retval = true;
415 	for (;;) {
416 		if (!nvmf_scan_active_nslist(sc, nslist, data, &nsid, cb,
417 		    cb_arg)) {
418 			retval = false;
419 			break;
420 		}
421 		if (nsid == 0)
422 			break;
423 	}
424 
425 	free(data, M_NVMF);
426 	free(nslist, M_NVMF);
427 	return (retval);
428 }
429 
430 static bool
431 nvmf_add_ns(struct nvmf_softc *sc, uint32_t nsid,
432     const struct nvme_namespace_data *data, void *arg __unused)
433 {
434 	if (sc->ns[nsid - 1] != NULL) {
435 		device_printf(sc->dev,
436 		    "duplicate namespace %u in active namespace list\n",
437 		    nsid);
438 		return (false);
439 	}
440 
441 	/*
442 	 * As in nvme_ns_construct, a size of zero indicates an
443 	 * invalid namespace.
444 	 */
445 	if (data->nsze == 0) {
446 		device_printf(sc->dev,
447 		    "ignoring active namespace %u with zero size\n", nsid);
448 		return (true);
449 	}
450 
451 	sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
452 
453 	nvmf_sim_rescan_ns(sc, nsid);
454 	return (true);
455 }
456 
457 static bool
458 nvmf_add_namespaces(struct nvmf_softc *sc)
459 {
460 	sc->ns = mallocarray(sc->cdata->nn, sizeof(*sc->ns), M_NVMF,
461 	    M_WAITOK | M_ZERO);
462 	return (nvmf_scan_active_namespaces(sc, nvmf_add_ns, NULL));
463 }
464 
465 static int
466 nvmf_attach(device_t dev)
467 {
468 	struct make_dev_args mda;
469 	struct nvmf_softc *sc = device_get_softc(dev);
470 	const nvlist_t *nvl = device_get_ivars(dev);
471 	const nvlist_t * const *io;
472 	struct sysctl_oid *oid;
473 	uint64_t val;
474 	u_int i;
475 	int error;
476 
477 	if (nvl == NULL)
478 		return (ENXIO);
479 
480 	sc->dev = dev;
481 	sc->trtype = nvlist_get_number(nvl, "trtype");
482 	callout_init(&sc->ka_rx_timer, 1);
483 	callout_init(&sc->ka_tx_timer, 1);
484 	sx_init(&sc->connection_lock, "nvmf connection");
485 	TASK_INIT(&sc->disconnect_task, 0, nvmf_disconnect_task, sc);
486 
487 	oid = SYSCTL_ADD_NODE(device_get_sysctl_ctx(dev),
488 	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "ioq",
489 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "I/O Queues");
490 	sc->ioq_oid_list = SYSCTL_CHILDREN(oid);
491 
492 	sc->cdata = malloc(sizeof(*sc->cdata), M_NVMF, M_WAITOK);
493 
494 	nvmf_init_aer(sc);
495 
496 	error = nvmf_establish_connection(sc, nvl);
497 	if (error != 0)
498 		goto out;
499 
500 	error = nvmf_read_property(sc, NVMF_PROP_CAP, 8, &sc->cap);
501 	if (error != 0) {
502 		device_printf(sc->dev, "Failed to fetch CAP\n");
503 		error = ENXIO;
504 		goto out;
505 	}
506 
507 	error = nvmf_read_property(sc, NVMF_PROP_VS, 4, &val);
508 	if (error != 0) {
509 		device_printf(sc->dev, "Failed to fetch VS\n");
510 		error = ENXIO;
511 		goto out;
512 	}
513 	sc->vs = val;
514 
515 	/* Honor MDTS if it is set. */
516 	sc->max_xfer_size = maxphys;
517 	if (sc->cdata->mdts != 0) {
518 		sc->max_xfer_size = ulmin(sc->max_xfer_size,
519 		    1 << (sc->cdata->mdts + NVME_MPS_SHIFT +
520 		    NVME_CAP_HI_MPSMIN(sc->cap >> 32)));
521 	}
522 
523 	io = nvlist_get_nvlist_array(nvl, "io", NULL);
524 	sc->max_pending_io = nvlist_get_number(io[0], "qsize") *
525 	    sc->num_io_queues;
526 
527 	error = nvmf_init_sim(sc);
528 	if (error != 0)
529 		goto out;
530 
531 	error = nvmf_start_aer(sc);
532 	if (error != 0) {
533 		nvmf_destroy_sim(sc);
534 		goto out;
535 	}
536 
537 	if (!nvmf_add_namespaces(sc)) {
538 		nvmf_destroy_sim(sc);
539 		goto out;
540 	}
541 
542 	make_dev_args_init(&mda);
543 	mda.mda_devsw = &nvmf_cdevsw;
544 	mda.mda_uid = UID_ROOT;
545 	mda.mda_gid = GID_WHEEL;
546 	mda.mda_mode = 0600;
547 	mda.mda_si_drv1 = sc;
548 	error = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev));
549 	if (error != 0) {
550 		nvmf_destroy_sim(sc);
551 		goto out;
552 	}
553 
554 	sc->shutdown_pre_sync_eh = EVENTHANDLER_REGISTER(shutdown_pre_sync,
555 	    nvmf_shutdown_pre_sync, sc, SHUTDOWN_PRI_FIRST);
556 	sc->shutdown_post_sync_eh = EVENTHANDLER_REGISTER(shutdown_post_sync,
557 	    nvmf_shutdown_post_sync, sc, SHUTDOWN_PRI_LAST);
558 
559 	return (0);
560 out:
561 	if (sc->ns != NULL) {
562 		for (i = 0; i < sc->cdata->nn; i++) {
563 			if (sc->ns[i] != NULL)
564 				nvmf_destroy_ns(sc->ns[i]);
565 		}
566 		free(sc->ns, M_NVMF);
567 	}
568 
569 	callout_drain(&sc->ka_tx_timer);
570 	callout_drain(&sc->ka_rx_timer);
571 
572 	if (sc->admin != NULL)
573 		nvmf_shutdown_controller(sc);
574 
575 	for (i = 0; i < sc->num_io_queues; i++) {
576 		if (sc->io[i] != NULL)
577 			nvmf_destroy_qp(sc->io[i]);
578 	}
579 	free(sc->io, M_NVMF);
580 	if (sc->admin != NULL)
581 		nvmf_destroy_qp(sc->admin);
582 
583 	nvmf_destroy_aer(sc);
584 
585 	taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
586 	sx_destroy(&sc->connection_lock);
587 	free(sc->cdata, M_NVMF);
588 	return (error);
589 }
590 
591 void
592 nvmf_disconnect(struct nvmf_softc *sc)
593 {
594 	taskqueue_enqueue(taskqueue_thread, &sc->disconnect_task);
595 }
596 
597 static void
598 nvmf_disconnect_task(void *arg, int pending __unused)
599 {
600 	struct nvmf_softc *sc = arg;
601 	u_int i;
602 
603 	sx_xlock(&sc->connection_lock);
604 	if (sc->admin == NULL) {
605 		/*
606 		 * Ignore transport errors if there is no active
607 		 * association.
608 		 */
609 		sx_xunlock(&sc->connection_lock);
610 		return;
611 	}
612 
613 	if (sc->detaching) {
614 		if (sc->admin != NULL) {
615 			/*
616 			 * This unsticks the detach process if a
617 			 * transport error occurs during detach.
618 			 */
619 			nvmf_shutdown_qp(sc->admin);
620 		}
621 		sx_xunlock(&sc->connection_lock);
622 		return;
623 	}
624 
625 	if (sc->cdev == NULL) {
626 		/*
627 		 * Transport error occurred during attach (nvmf_add_namespaces).
628 		 * Shutdown the admin queue.
629 		 */
630 		nvmf_shutdown_qp(sc->admin);
631 		sx_xunlock(&sc->connection_lock);
632 		return;
633 	}
634 
635 	callout_drain(&sc->ka_tx_timer);
636 	callout_drain(&sc->ka_rx_timer);
637 	sc->ka_traffic = false;
638 
639 	/* Quiesce namespace consumers. */
640 	nvmf_disconnect_sim(sc);
641 	for (i = 0; i < sc->cdata->nn; i++) {
642 		if (sc->ns[i] != NULL)
643 			nvmf_disconnect_ns(sc->ns[i]);
644 	}
645 
646 	/* Shutdown the existing qpairs. */
647 	for (i = 0; i < sc->num_io_queues; i++) {
648 		nvmf_destroy_qp(sc->io[i]);
649 	}
650 	free(sc->io, M_NVMF);
651 	sc->io = NULL;
652 	sc->num_io_queues = 0;
653 	nvmf_destroy_qp(sc->admin);
654 	sc->admin = NULL;
655 
656 	sx_xunlock(&sc->connection_lock);
657 }
658 
659 static int
660 nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
661 {
662 	const struct nvme_controller_data *cdata;
663 	nvlist_t *nvl;
664 	u_int i;
665 	int error;
666 
667 	error = nvmf_copyin_handoff(nv, &nvl);
668 	if (error != 0)
669 		return (error);
670 
671 	/* XXX: Should we permit changing the transport type? */
672 	if (sc->trtype != nvlist_get_number(nvl, "trtype")) {
673 		device_printf(sc->dev,
674 		    "transport type mismatch on reconnect\n");
675 		return (EINVAL);
676 	}
677 
678 	sx_xlock(&sc->connection_lock);
679 	if (sc->admin != NULL || sc->detaching) {
680 		error = EBUSY;
681 		goto out;
682 	}
683 
684 	/*
685 	 * Ensure this is for the same controller.  Note that the
686 	 * controller ID can vary across associations if the remote
687 	 * system is using the dynamic controller model.  This merely
688 	 * ensures the new association is connected to the same NVMe
689 	 * subsystem.
690 	 */
691 	cdata = nvlist_get_binary(nvl, "cdata", NULL);
692 	if (memcmp(sc->cdata->subnqn, cdata->subnqn,
693 	    sizeof(cdata->subnqn)) != 0) {
694 		device_printf(sc->dev,
695 		    "controller subsystem NQN mismatch on reconnect\n");
696 		error = EINVAL;
697 		goto out;
698 	}
699 
700 	/*
701 	 * XXX: Require same number and size of I/O queues so that
702 	 * max_pending_io is still correct?
703 	 */
704 
705 	error = nvmf_establish_connection(sc, nvl);
706 	if (error != 0)
707 		goto out;
708 
709 	error = nvmf_start_aer(sc);
710 	if (error != 0)
711 		goto out;
712 
713 	device_printf(sc->dev,
714 	    "established new association with %u I/O queues\n",
715 	    sc->num_io_queues);
716 
717 	/* Restart namespace consumers. */
718 	for (i = 0; i < sc->cdata->nn; i++) {
719 		if (sc->ns[i] != NULL)
720 			nvmf_reconnect_ns(sc->ns[i]);
721 	}
722 	nvmf_reconnect_sim(sc);
723 
724 	nvmf_rescan_all_ns(sc);
725 out:
726 	sx_xunlock(&sc->connection_lock);
727 	nvlist_destroy(nvl);
728 	return (error);
729 }
730 
731 static void
732 nvmf_shutdown_pre_sync(void *arg, int howto)
733 {
734 	struct nvmf_softc *sc = arg;
735 
736 	if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED())
737 		return;
738 
739 	/*
740 	 * If this association is disconnected, abort any pending
741 	 * requests with an error to permit filesystems to unmount
742 	 * without hanging.
743 	 */
744 	sx_xlock(&sc->connection_lock);
745 	if (sc->admin != NULL || sc->detaching) {
746 		sx_xunlock(&sc->connection_lock);
747 		return;
748 	}
749 
750 	for (u_int i = 0; i < sc->cdata->nn; i++) {
751 		if (sc->ns[i] != NULL)
752 			nvmf_shutdown_ns(sc->ns[i]);
753 	}
754 	nvmf_shutdown_sim(sc);
755 	sx_xunlock(&sc->connection_lock);
756 }
757 
758 static void
759 nvmf_shutdown_post_sync(void *arg, int howto)
760 {
761 	struct nvmf_softc *sc = arg;
762 
763 	if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED())
764 		return;
765 
766 	/*
767 	 * If this association is connected, disconnect gracefully.
768 	 */
769 	sx_xlock(&sc->connection_lock);
770 	if (sc->admin == NULL || sc->detaching) {
771 		sx_xunlock(&sc->connection_lock);
772 		return;
773 	}
774 
775 	callout_drain(&sc->ka_tx_timer);
776 	callout_drain(&sc->ka_rx_timer);
777 
778 	nvmf_shutdown_controller(sc);
779 
780 	/*
781 	 * Quiesce consumers so that any commands submitted after this
782 	 * fail with an error.  Notably, nda(4) calls nda_flush() from
783 	 * a post_sync handler that might be ordered after this one.
784 	 */
785 	for (u_int i = 0; i < sc->cdata->nn; i++) {
786 		if (sc->ns[i] != NULL)
787 			nvmf_shutdown_ns(sc->ns[i]);
788 	}
789 	nvmf_shutdown_sim(sc);
790 
791 	for (u_int i = 0; i < sc->num_io_queues; i++) {
792 		nvmf_destroy_qp(sc->io[i]);
793 	}
794 	nvmf_destroy_qp(sc->admin);
795 	sc->admin = NULL;
796 	sx_xunlock(&sc->connection_lock);
797 }
798 
799 static int
800 nvmf_detach(device_t dev)
801 {
802 	struct nvmf_softc *sc = device_get_softc(dev);
803 	u_int i;
804 
805 	destroy_dev(sc->cdev);
806 
807 	sx_xlock(&sc->connection_lock);
808 	sc->detaching = true;
809 	sx_xunlock(&sc->connection_lock);
810 
811 	EVENTHANDLER_DEREGISTER(shutdown_pre_sync, sc->shutdown_pre_sync_eh);
812 	EVENTHANDLER_DEREGISTER(shutdown_post_sync, sc->shutdown_post_sync_eh);
813 
814 	nvmf_destroy_sim(sc);
815 	for (i = 0; i < sc->cdata->nn; i++) {
816 		if (sc->ns[i] != NULL)
817 			nvmf_destroy_ns(sc->ns[i]);
818 	}
819 	free(sc->ns, M_NVMF);
820 
821 	callout_drain(&sc->ka_tx_timer);
822 	callout_drain(&sc->ka_rx_timer);
823 
824 	if (sc->admin != NULL)
825 		nvmf_shutdown_controller(sc);
826 
827 	for (i = 0; i < sc->num_io_queues; i++) {
828 		nvmf_destroy_qp(sc->io[i]);
829 	}
830 	free(sc->io, M_NVMF);
831 
832 	taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
833 
834 	if (sc->admin != NULL)
835 		nvmf_destroy_qp(sc->admin);
836 
837 	nvmf_destroy_aer(sc);
838 
839 	sx_destroy(&sc->connection_lock);
840 	free(sc->cdata, M_NVMF);
841 	return (0);
842 }
843 
844 static void
845 nvmf_rescan_ns_1(struct nvmf_softc *sc, uint32_t nsid,
846     const struct nvme_namespace_data *data)
847 {
848 	struct nvmf_namespace *ns;
849 
850 	/* XXX: Needs locking around sc->ns[]. */
851 	ns = sc->ns[nsid - 1];
852 	if (data->nsze == 0) {
853 		/* XXX: Needs locking */
854 		if (ns != NULL) {
855 			nvmf_destroy_ns(ns);
856 			sc->ns[nsid - 1] = NULL;
857 		}
858 	} else {
859 		/* XXX: Needs locking */
860 		if (ns == NULL) {
861 			sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
862 		} else {
863 			if (!nvmf_update_ns(ns, data)) {
864 				nvmf_destroy_ns(ns);
865 				sc->ns[nsid - 1] = NULL;
866 			}
867 		}
868 	}
869 
870 	nvmf_sim_rescan_ns(sc, nsid);
871 }
872 
873 void
874 nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid)
875 {
876 	struct nvmf_completion_status status;
877 	struct nvme_namespace_data *data;
878 
879 	data = malloc(sizeof(*data), M_NVMF, M_WAITOK);
880 
881 	nvmf_status_init(&status);
882 	nvmf_status_wait_io(&status);
883 	if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
884 	    &status, nvmf_io_complete, &status, M_WAITOK)) {
885 		device_printf(sc->dev,
886 		    "failed to send IDENTIFY namespace %u command\n", nsid);
887 		free(data, M_NVMF);
888 		return;
889 	}
890 	nvmf_wait_for_reply(&status);
891 
892 	if (status.cqe.status != 0) {
893 		device_printf(sc->dev,
894 		    "IDENTIFY namespace %u failed, status %#x\n", nsid,
895 		    le16toh(status.cqe.status));
896 		free(data, M_NVMF);
897 		return;
898 	}
899 
900 	if (status.io_error != 0) {
901 		device_printf(sc->dev,
902 		    "IDENTIFY namespace %u failed with I/O error %d\n",
903 		    nsid, status.io_error);
904 		free(data, M_NVMF);
905 		return;
906 	}
907 
908 	nvme_namespace_data_swapbytes(data);
909 
910 	nvmf_rescan_ns_1(sc, nsid, data);
911 
912 	free(data, M_NVMF);
913 }
914 
915 static void
916 nvmf_purge_namespaces(struct nvmf_softc *sc, uint32_t first_nsid,
917     uint32_t next_valid_nsid)
918 {
919 	struct nvmf_namespace *ns;
920 
921 	for (uint32_t nsid = first_nsid; nsid < next_valid_nsid; nsid++)
922 	{
923 		/* XXX: Needs locking around sc->ns[]. */
924 		ns = sc->ns[nsid - 1];
925 		if (ns != NULL) {
926 			nvmf_destroy_ns(ns);
927 			sc->ns[nsid - 1] = NULL;
928 
929 			nvmf_sim_rescan_ns(sc, nsid);
930 		}
931 	}
932 }
933 
934 static bool
935 nvmf_rescan_ns_cb(struct nvmf_softc *sc, uint32_t nsid,
936     const struct nvme_namespace_data *data, void *arg)
937 {
938 	uint32_t *last_nsid = arg;
939 
940 	/* Check for any gaps prior to this namespace. */
941 	nvmf_purge_namespaces(sc, *last_nsid + 1, nsid);
942 	*last_nsid = nsid;
943 
944 	nvmf_rescan_ns_1(sc, nsid, data);
945 	return (true);
946 }
947 
948 void
949 nvmf_rescan_all_ns(struct nvmf_softc *sc)
950 {
951 	uint32_t last_nsid;
952 
953 	last_nsid = 0;
954 	if (!nvmf_scan_active_namespaces(sc, nvmf_rescan_ns_cb, &last_nsid))
955 		return;
956 
957 	/*
958 	 * Check for any namespace devices after the last active
959 	 * namespace.
960 	 */
961 	nvmf_purge_namespaces(sc, last_nsid + 1, sc->cdata->nn + 1);
962 }
963 
964 int
965 nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt,
966     bool admin)
967 {
968 	struct nvmf_completion_status status;
969 	struct nvme_command cmd;
970 	struct memdesc mem;
971 	struct nvmf_host_qpair *qp;
972 	struct nvmf_request *req;
973 	void *buf;
974 	int error;
975 
976 	if (pt->len > sc->max_xfer_size)
977 		return (EINVAL);
978 
979 	buf = NULL;
980 	if (pt->len != 0) {
981 		/*
982 		 * XXX: Depending on the size we may want to pin the
983 		 * user pages and use a memdesc with vm_page_t's
984 		 * instead.
985 		 */
986 		buf = malloc(pt->len, M_NVMF, M_WAITOK);
987 		if (pt->is_read == 0) {
988 			error = copyin(pt->buf, buf, pt->len);
989 			if (error != 0) {
990 				free(buf, M_NVMF);
991 				return (error);
992 			}
993 		} else {
994 			/* Ensure no kernel data is leaked to userland. */
995 			memset(buf, 0, pt->len);
996 		}
997 	}
998 
999 	memset(&cmd, 0, sizeof(cmd));
1000 	cmd.opc = pt->cmd.opc;
1001 	cmd.fuse = pt->cmd.fuse;
1002 	cmd.nsid = pt->cmd.nsid;
1003 	cmd.cdw10 = pt->cmd.cdw10;
1004 	cmd.cdw11 = pt->cmd.cdw11;
1005 	cmd.cdw12 = pt->cmd.cdw12;
1006 	cmd.cdw13 = pt->cmd.cdw13;
1007 	cmd.cdw14 = pt->cmd.cdw14;
1008 	cmd.cdw15 = pt->cmd.cdw15;
1009 
1010 	sx_slock(&sc->connection_lock);
1011 	if (sc->admin == NULL || sc->detaching) {
1012 		device_printf(sc->dev,
1013 		    "failed to send passthrough command\n");
1014 		error = ECONNABORTED;
1015 		sx_sunlock(&sc->connection_lock);
1016 		goto error;
1017 	}
1018 	if (admin)
1019 		qp = sc->admin;
1020 	else
1021 		qp = nvmf_select_io_queue(sc);
1022 	nvmf_status_init(&status);
1023 	req = nvmf_allocate_request(qp, &cmd, nvmf_complete, &status, M_WAITOK);
1024 	sx_sunlock(&sc->connection_lock);
1025 	if (req == NULL) {
1026 		device_printf(sc->dev, "failed to send passthrough command\n");
1027 		error = ECONNABORTED;
1028 		goto error;
1029 	}
1030 
1031 	if (pt->len != 0) {
1032 		mem = memdesc_vaddr(buf, pt->len);
1033 		nvmf_capsule_append_data(req->nc, &mem, pt->len,
1034 		    pt->is_read == 0, nvmf_io_complete, &status);
1035 		nvmf_status_wait_io(&status);
1036 	}
1037 
1038 	nvmf_submit_request(req);
1039 	nvmf_wait_for_reply(&status);
1040 
1041 	memset(&pt->cpl, 0, sizeof(pt->cpl));
1042 	pt->cpl.cdw0 = status.cqe.cdw0;
1043 	pt->cpl.status = status.cqe.status;
1044 
1045 	error = status.io_error;
1046 	if (error == 0 && pt->len != 0 && pt->is_read != 0)
1047 		error = copyout(buf, pt->buf, pt->len);
1048 error:
1049 	free(buf, M_NVMF);
1050 	return (error);
1051 }
1052 
1053 static int
1054 nvmf_reconnect_params(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
1055 {
1056 	nvlist_t *nvl;
1057 	int error;
1058 
1059 	nvl = nvlist_create(0);
1060 
1061 	sx_slock(&sc->connection_lock);
1062 	if ((sc->cdata->fcatt & 1) == 0)
1063 		nvlist_add_number(nvl, "cntlid", NVMF_CNTLID_DYNAMIC);
1064 	else
1065 		nvlist_add_number(nvl, "cntlid", sc->cdata->ctrlr_id);
1066 	nvlist_add_stringf(nvl, "subnqn", "%.256s", sc->cdata->subnqn);
1067 	sx_sunlock(&sc->connection_lock);
1068 
1069 	error = nvmf_pack_ioc_nvlist(nvl, nv);
1070 	nvlist_destroy(nvl);
1071 	return (error);
1072 }
1073 
1074 static int
1075 nvmf_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
1076     struct thread *td)
1077 {
1078 	struct nvmf_softc *sc = cdev->si_drv1;
1079 	struct nvme_get_nsid *gnsid;
1080 	struct nvme_pt_command *pt;
1081 	struct nvmf_ioc_nv *nv;
1082 
1083 	switch (cmd) {
1084 	case NVME_PASSTHROUGH_CMD:
1085 		pt = (struct nvme_pt_command *)arg;
1086 		return (nvmf_passthrough_cmd(sc, pt, true));
1087 	case NVME_GET_NSID:
1088 		gnsid = (struct nvme_get_nsid *)arg;
1089 		strlcpy(gnsid->cdev, device_get_nameunit(sc->dev),
1090 		    sizeof(gnsid->cdev));
1091 		gnsid->nsid = 0;
1092 		return (0);
1093 	case NVME_GET_MAX_XFER_SIZE:
1094 		*(uint64_t *)arg = sc->max_xfer_size;
1095 		return (0);
1096 	case NVMF_RECONNECT_PARAMS:
1097 		nv = (struct nvmf_ioc_nv *)arg;
1098 		return (nvmf_reconnect_params(sc, nv));
1099 	case NVMF_RECONNECT_HOST:
1100 		nv = (struct nvmf_ioc_nv *)arg;
1101 		return (nvmf_reconnect_host(sc, nv));
1102 	default:
1103 		return (ENOTTY);
1104 	}
1105 }
1106 
/*
 * Character device switch for the per-controller control device; only
 * the ioctl entry point is provided.
 */
static struct cdevsw nvmf_cdevsw = {
	.d_version = D_VERSION,
	.d_ioctl = nvmf_ioctl
};
1111 
1112 static int
1113 nvmf_modevent(module_t mod, int what, void *arg)
1114 {
1115 	switch (what) {
1116 	case MOD_LOAD:
1117 		return (nvmf_ctl_load());
1118 	case MOD_QUIESCE:
1119 		return (0);
1120 	case MOD_UNLOAD:
1121 		nvmf_ctl_unload();
1122 		destroy_dev_drain(&nvmf_cdevsw);
1123 		return (0);
1124 	default:
1125 		return (EOPNOTSUPP);
1126 	}
1127 }
1128 
/* newbus method table: probe, attach and detach only. */
static device_method_t nvmf_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,     nvmf_probe),
	DEVMETHOD(device_attach,    nvmf_attach),
	DEVMETHOD(device_detach,    nvmf_detach),
	DEVMETHOD_END
};

/* Driver definition: registered under the name "nvme". */
driver_t nvme_nvmf_driver = {
	"nvme",
	nvmf_methods,
	sizeof(struct nvmf_softc),
};

/*
 * Attach the driver to the root bus with nvmf_modevent as the module
 * event handler, and declare the dependency on nvmf_transport.
 */
DRIVER_MODULE(nvme, root, nvme_nvmf_driver, nvmf_modevent, NULL);
MODULE_DEPEND(nvmf, nvmf_transport, 1, 1, 1);
1145