/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2023-2024 Chelsio Communications, Inc.
 * Written by: John Baldwin <jhb@FreeBSD.org>
 */

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/memdesc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/reboot.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/nvmf.h>
#include <dev/nvmf/nvmf_transport.h>
#include <dev/nvmf/host/nvmf_var.h>

static struct cdevsw nvmf_cdevsw;

bool nvmf_fail_disconnect = false;
SYSCTL_BOOL(_kern_nvmf, OID_AUTO, fail_on_disconnection, CTLFLAG_RWTUN,
    &nvmf_fail_disconnect, 0, "Fail I/O requests on connection failure");

MALLOC_DEFINE(M_NVMF, "nvmf", "NVMe over Fabrics host");

static void	nvmf_disconnect_task(void *arg, int pending);
static void	nvmf_shutdown_pre_sync(void *arg, int howto);
static void	nvmf_shutdown_post_sync(void *arg, int howto);

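/*
 * Synchronous commands use a struct nvmf_completion_status to wait for
 * both the command's completion queue entry and, for commands with a
 * data transfer, the I/O completion.  Sleeps and wakeups are
 * serialized by a mutex from mtxpool_sleep keyed on the address of the
 * status structure, so no per-command mutex is needed.
 */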
void
nvmf_complete(void *arg, const struct nvme_completion *cqe)
{
	struct nvmf_completion_status *status = arg;
	struct mtx *mtx;

	status->cqe = *cqe;
	mtx = mtx_pool_find(mtxpool_sleep, status);
	mtx_lock(mtx);
	status->done = true;
	mtx_unlock(mtx);
	wakeup(status);
}

void
nvmf_io_complete(void *arg, size_t xfered, int error)
{
	struct nvmf_completion_status *status = arg;
	struct mtx *mtx;

	status->io_error = error;
	mtx = mtx_pool_find(mtxpool_sleep, status);
	mtx_lock(mtx);
	status->io_done = true;
	mtx_unlock(mtx);
	wakeup(status);
}

void
nvmf_wait_for_reply(struct nvmf_completion_status *status)
{
	struct mtx *mtx;

	mtx = mtx_pool_find(mtxpool_sleep, status);
	mtx_lock(mtx);
	while (!status->done || !status->io_done)
		mtx_sleep(status, mtx, 0, "nvmfcmd", 0);
	mtx_unlock(mtx);
}

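/*
 * Controller properties (CAP, VS, CC, CSTS, ...) are the Fabrics
 * analog of PCIe controller registers and are accessed via the
 * PROPERTY_GET and PROPERTY_SET Fabrics commands on the admin queue.
 * The helpers below follow the usual synchronous pattern:
 *
 *	nvmf_status_init(&status);
 *	if (!nvmf_cmd_get_property(sc, offset, size, nvmf_complete,
 *	    &status, M_WAITOK))
 *		return (ECONNABORTED);
 *	nvmf_wait_for_reply(&status);
 */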
static int
nvmf_read_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
    uint64_t *value)
{
	const struct nvmf_fabric_prop_get_rsp *rsp;
	struct nvmf_completion_status status;

	nvmf_status_init(&status);
	if (!nvmf_cmd_get_property(sc, offset, size, nvmf_complete, &status,
	    M_WAITOK))
		return (ECONNABORTED);
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev, "PROPERTY_GET failed, status %#x\n",
		    le16toh(status.cqe.status));
		return (EIO);
	}

	rsp = (const struct nvmf_fabric_prop_get_rsp *)&status.cqe;
	if (size == 8)
		*value = le64toh(rsp->value.u64);
	else
		*value = le32toh(rsp->value.u32.low);
	return (0);
}

static int
nvmf_write_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
    uint64_t value)
{
	struct nvmf_completion_status status;

	nvmf_status_init(&status);
	if (!nvmf_cmd_set_property(sc, offset, size, value, nvmf_complete,
	    &status, M_WAITOK))
		return (ECONNABORTED);
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev, "PROPERTY_SET failed, status %#x\n",
		    le16toh(status.cqe.status));
		return (EIO);
	}
	return (0);
}

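/*
 * Request a normal controller shutdown by setting CC.SHN, mirroring
 * the shutdown handshake of a PCIe controller.  Note that this does
 * not poll CSTS.SHST to wait for the shutdown to complete.
 */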
static void
nvmf_shutdown_controller(struct nvmf_softc *sc)
{
	uint64_t cc;
	int error;

	error = nvmf_read_property(sc, NVMF_PROP_CC, 4, &cc);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch CC for shutdown\n");
		return;
	}

	cc |= NVMEF(NVME_CC_REG_SHN, NVME_SHN_NORMAL);

	error = nvmf_write_property(sc, NVMF_PROP_CC, 4, cc);
	if (error != 0)
		device_printf(sc->dev,
		    "Failed to set CC to trigger shutdown\n");
}

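/*
 * KeepAlive handling: the ka_rx timer fires once per KATO interval and
 * disconnects the association if no traffic was received from the
 * controller during that interval, while the ka_tx timer fires at half
 * that interval and transmits a KEEP ALIVE command unless other
 * commands already counted as keep-alives (TBKAS).
 */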
static void
nvmf_check_keep_alive(void *arg)
{
	struct nvmf_softc *sc = arg;
	int traffic;

	traffic = atomic_readandclear_int(&sc->ka_active_rx_traffic);
	if (traffic == 0) {
		device_printf(sc->dev,
		    "disconnecting due to KeepAlive timeout\n");
		nvmf_disconnect(sc);
		return;
	}

	callout_schedule_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0, C_HARDCLOCK);
}

static void
nvmf_keep_alive_complete(void *arg, const struct nvme_completion *cqe)
{
	struct nvmf_softc *sc = arg;

	atomic_store_int(&sc->ka_active_rx_traffic, 1);
	if (cqe->status != 0) {
		device_printf(sc->dev,
		    "KeepAlive response reported status %#x\n",
		    le16toh(cqe->status));
	}
}

static void
nvmf_send_keep_alive(void *arg)
{
	struct nvmf_softc *sc = arg;
	int traffic;

	/*
	 * Don't bother sending a KeepAlive command if TBKAS is active
	 * and another command has been sent during the interval.
	 */
	traffic = atomic_load_int(&sc->ka_active_tx_traffic);
	if (traffic == 0 && !nvmf_cmd_keep_alive(sc, nvmf_keep_alive_complete,
	    sc, M_NOWAIT))
		device_printf(sc->dev,
		    "Failed to allocate KeepAlive command\n");

	/* Clear ka_active_tx_traffic after sending the KeepAlive command. */
	atomic_store_int(&sc->ka_active_tx_traffic, 0);

	callout_schedule_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, C_HARDCLOCK);
}

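/*
 * Instance variables carry the state handed off from userland for a
 * new association: the controller data and the per-queue transport
 * parameters are copied in from user memory and validated before the
 * device is created.
 */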
int
nvmf_init_ivars(struct nvmf_ivars *ivars, struct nvmf_handoff_host *hh)
{
	size_t len;
	u_int i;
	int error;

	memset(ivars, 0, sizeof(*ivars));

	if (!hh->admin.admin || hh->num_io_queues < 1)
		return (EINVAL);

	ivars->cdata = malloc(sizeof(*ivars->cdata), M_NVMF, M_WAITOK);
	error = copyin(hh->cdata, ivars->cdata, sizeof(*ivars->cdata));
	if (error != 0)
		goto out;
	nvme_controller_data_swapbytes(ivars->cdata);

	len = hh->num_io_queues * sizeof(*ivars->io_params);
	ivars->io_params = malloc(len, M_NVMF, M_WAITOK);
	error = copyin(hh->io, ivars->io_params, len);
	if (error != 0)
		goto out;
	for (i = 0; i < hh->num_io_queues; i++) {
		if (ivars->io_params[i].admin) {
			error = EINVAL;
			goto out;
		}

		/* Require all I/O queues to be the same size. */
		if (ivars->io_params[i].qsize != ivars->io_params[0].qsize) {
			error = EINVAL;
			goto out;
		}
	}

	ivars->hh = hh;
	return (0);

out:
	free(ivars->io_params, M_NVMF);
	free(ivars->cdata, M_NVMF);
	return (error);
}

void
nvmf_free_ivars(struct nvmf_ivars *ivars)
{
	free(ivars->io_params, M_NVMF);
	free(ivars->cdata, M_NVMF);
}

static int
nvmf_probe(device_t dev)
{
	struct nvmf_ivars *ivars = device_get_ivars(dev);
	char desc[260];

	if (ivars == NULL)
		return (ENXIO);

	snprintf(desc, sizeof(desc), "Fabrics: %.256s", ivars->cdata->subnqn);
	device_set_desc_copy(dev, desc);
	return (BUS_PROBE_DEFAULT);
}

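/*
 * Adopt the admin and I/O queue pairs described by a handoff and start
 * the KeepAlive timers for the new association.
 */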
static int
nvmf_establish_connection(struct nvmf_softc *sc, struct nvmf_ivars *ivars)
{
	char name[16];

	/* Set up the admin queue. */
	sc->admin = nvmf_init_qp(sc, ivars->hh->trtype, &ivars->hh->admin,
	    "admin queue");
	if (sc->admin == NULL) {
		device_printf(sc->dev, "Failed to setup admin queue\n");
		return (ENXIO);
	}

	/* Set up the I/O queues. */
	sc->io = malloc(ivars->hh->num_io_queues * sizeof(*sc->io), M_NVMF,
	    M_WAITOK | M_ZERO);
	sc->num_io_queues = ivars->hh->num_io_queues;
	for (u_int i = 0; i < sc->num_io_queues; i++) {
		snprintf(name, sizeof(name), "I/O queue %u", i);
		sc->io[i] = nvmf_init_qp(sc, ivars->hh->trtype,
		    &ivars->io_params[i], name);
		if (sc->io[i] == NULL) {
			device_printf(sc->dev, "Failed to setup I/O queue %u\n",
			    i);
			return (ENXIO);
		}
	}

	/* Start KeepAlive timers. */
	if (ivars->hh->kato != 0) {
		sc->ka_traffic = NVMEV(NVME_CTRLR_DATA_CTRATT_TBKAS,
		    sc->cdata->ctratt) != 0;
		sc->ka_rx_sbt = mstosbt(ivars->hh->kato);
		sc->ka_tx_sbt = sc->ka_rx_sbt / 2;
		callout_reset_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0,
		    nvmf_check_keep_alive, sc, C_HARDCLOCK);
		callout_reset_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0,
		    nvmf_send_keep_alive, sc, C_HARDCLOCK);
	}

	return (0);
}

typedef bool nvmf_scan_active_ns_cb(struct nvmf_softc *, uint32_t,
    const struct nvme_namespace_data *, void *);

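/*
 * Fetch one page of the active namespace list starting at *nsidp and
 * invoke the callback with the identify data of each active namespace.
 * On return, *nsidp is zero if the scan is complete or holds the
 * starting NSID for the next page.  NSIDs 0xfffffffe and 0xffffffff
 * are reserved, so the scan also stops there.
 */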
static bool
nvmf_scan_active_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist,
    struct nvme_namespace_data *data, uint32_t *nsidp,
    nvmf_scan_active_ns_cb *cb, void *cb_arg)
{
	struct nvmf_completion_status status;
	uint32_t nsid;

	nvmf_status_init(&status);
	nvmf_status_wait_io(&status);
	if (!nvmf_cmd_identify_active_namespaces(sc, *nsidp, nslist,
	    nvmf_complete, &status, nvmf_io_complete, &status, M_WAITOK)) {
		device_printf(sc->dev,
		    "failed to send IDENTIFY active namespaces command\n");
		return (false);
	}
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev,
		    "IDENTIFY active namespaces failed, status %#x\n",
		    le16toh(status.cqe.status));
		return (false);
	}

	if (status.io_error != 0) {
		device_printf(sc->dev,
		    "IDENTIFY active namespaces failed with I/O error %d\n",
		    status.io_error);
		return (false);
	}

	for (u_int i = 0; i < nitems(nslist->ns); i++) {
		nsid = nslist->ns[i];
		if (nsid == 0) {
			*nsidp = 0;
			return (true);
		}

		nvmf_status_init(&status);
		nvmf_status_wait_io(&status);
		if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
		    &status, nvmf_io_complete, &status, M_WAITOK)) {
			device_printf(sc->dev,
			    "failed to send IDENTIFY namespace %u command\n",
			    nsid);
			return (false);
		}
		nvmf_wait_for_reply(&status);

		if (status.cqe.status != 0) {
			device_printf(sc->dev,
			    "IDENTIFY namespace %u failed, status %#x\n", nsid,
			    le16toh(status.cqe.status));
			return (false);
		}

		if (status.io_error != 0) {
			device_printf(sc->dev,
			    "IDENTIFY namespace %u failed with I/O error %d\n",
			    nsid, status.io_error);
			return (false);
		}

		nvme_namespace_data_swapbytes(data);
		if (!cb(sc, nsid, data, cb_arg))
			return (false);
	}

	MPASS(nsid == nslist->ns[nitems(nslist->ns) - 1] && nsid != 0);

	if (nsid >= 0xfffffffd)
		*nsidp = 0;
	else
		*nsidp = nsid + 1;
	return (true);
}

static bool
nvmf_scan_active_namespaces(struct nvmf_softc *sc, nvmf_scan_active_ns_cb *cb,
    void *cb_arg)
{
	struct nvme_namespace_data *data;
	struct nvme_ns_list *nslist;
	uint32_t nsid;
	bool retval;

	nslist = malloc(sizeof(*nslist), M_NVMF, M_WAITOK);
	data = malloc(sizeof(*data), M_NVMF, M_WAITOK);

	nsid = 0;
	retval = true;
	for (;;) {
		if (!nvmf_scan_active_nslist(sc, nslist, data, &nsid, cb,
		    cb_arg)) {
			retval = false;
			break;
		}
		if (nsid == 0)
			break;
	}

	free(data, M_NVMF);
	free(nslist, M_NVMF);
	return (retval);
}

static bool
nvmf_add_ns(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data, void *arg __unused)
{
	if (sc->ns[nsid - 1] != NULL) {
		device_printf(sc->dev,
		    "duplicate namespace %u in active namespace list\n",
		    nsid);
		return (false);
	}

	/*
	 * As in nvme_ns_construct, a size of zero indicates an
	 * invalid namespace.
	 */
	if (data->nsze == 0) {
		device_printf(sc->dev,
		    "ignoring active namespace %u with zero size\n", nsid);
		return (true);
	}

	sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);

	nvmf_sim_rescan_ns(sc, nsid);
	return (true);
}

static bool
nvmf_add_namespaces(struct nvmf_softc *sc)
{
	sc->ns = mallocarray(sc->cdata->nn, sizeof(*sc->ns), M_NVMF,
	    M_WAITOK | M_ZERO);
	return (nvmf_scan_active_namespaces(sc, nvmf_add_ns, NULL));
}

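/*
 * Attach completes a new association: adopt the queues from the
 * handoff, read CAP and VS, size transfers based on MDTS, create the
 * CAM sim and namespace devices, and expose a control character
 * device for passthrough and reconnection ioctls.
 */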
static int
nvmf_attach(device_t dev)
{
	struct make_dev_args mda;
	struct nvmf_softc *sc = device_get_softc(dev);
	struct nvmf_ivars *ivars = device_get_ivars(dev);
	uint64_t val;
	u_int i;
	int error;

	if (ivars == NULL)
		return (ENXIO);

	sc->dev = dev;
	sc->trtype = ivars->hh->trtype;
	callout_init(&sc->ka_rx_timer, 1);
	callout_init(&sc->ka_tx_timer, 1);
	sx_init(&sc->connection_lock, "nvmf connection");
	TASK_INIT(&sc->disconnect_task, 0, nvmf_disconnect_task, sc);

	/* Claim the cdata pointer from ivars. */
	sc->cdata = ivars->cdata;
	ivars->cdata = NULL;

	nvmf_init_aer(sc);

	/* TODO: Multiqueue support. */
	sc->max_pending_io = ivars->io_params[0].qsize /* * sc->num_io_queues */;

	error = nvmf_establish_connection(sc, ivars);
	if (error != 0)
		goto out;

	error = nvmf_read_property(sc, NVMF_PROP_CAP, 8, &sc->cap);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch CAP\n");
		error = ENXIO;
		goto out;
	}

	error = nvmf_read_property(sc, NVMF_PROP_VS, 4, &val);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch VS\n");
		error = ENXIO;
		goto out;
	}
	sc->vs = val;

	/* Honor MDTS if it is set. */
	sc->max_xfer_size = maxphys;
	if (sc->cdata->mdts != 0) {
		sc->max_xfer_size = ulmin(sc->max_xfer_size,
		    1 << (sc->cdata->mdts + NVME_MPS_SHIFT +
		    NVME_CAP_HI_MPSMIN(sc->cap >> 32)));
	}

	error = nvmf_init_sim(sc);
	if (error != 0)
		goto out;

	error = nvmf_start_aer(sc);
	if (error != 0) {
		nvmf_destroy_sim(sc);
		goto out;
	}
	if (!nvmf_add_namespaces(sc)) {
		error = ENXIO;
		nvmf_destroy_sim(sc);
		goto out;
	}
	make_dev_args_init(&mda);
	mda.mda_devsw = &nvmf_cdevsw;
	mda.mda_uid = UID_ROOT;
	mda.mda_gid = GID_WHEEL;
	mda.mda_mode = 0600;
	mda.mda_si_drv1 = sc;
	error = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev));
	if (error != 0) {
		nvmf_destroy_sim(sc);
		goto out;
	}

	sc->shutdown_pre_sync_eh = EVENTHANDLER_REGISTER(shutdown_pre_sync,
	    nvmf_shutdown_pre_sync, sc, SHUTDOWN_PRI_FIRST);
	sc->shutdown_post_sync_eh = EVENTHANDLER_REGISTER(shutdown_post_sync,
	    nvmf_shutdown_post_sync, sc, SHUTDOWN_PRI_FIRST);

	return (0);
out:
	if (sc->ns != NULL) {
		for (i = 0; i < sc->cdata->nn; i++) {
			if (sc->ns[i] != NULL)
				nvmf_destroy_ns(sc->ns[i]);
		}
		free(sc->ns, M_NVMF);
	}

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);

	if (sc->admin != NULL)
		nvmf_shutdown_controller(sc);

	for (i = 0; i < sc->num_io_queues; i++) {
		if (sc->io[i] != NULL)
			nvmf_destroy_qp(sc->io[i]);
	}
	free(sc->io, M_NVMF);
	if (sc->admin != NULL)
		nvmf_destroy_qp(sc->admin);

	nvmf_destroy_aer(sc);

	taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
	sx_destroy(&sc->connection_lock);
	free(sc->cdata, M_NVMF);
	return (error);
}

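/*
 * Transports invoke nvmf_disconnect when an association fails.  The
 * actual teardown is deferred to a taskqueue task since it acquires
 * the connection lock and may sleep.
 */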
void
nvmf_disconnect(struct nvmf_softc *sc)
{
	taskqueue_enqueue(taskqueue_thread, &sc->disconnect_task);
}

static void
nvmf_disconnect_task(void *arg, int pending __unused)
{
	struct nvmf_softc *sc = arg;
	u_int i;

	sx_xlock(&sc->connection_lock);
	if (sc->admin == NULL) {
		/*
		 * Ignore transport errors if there is no active
		 * association.
		 */
		sx_xunlock(&sc->connection_lock);
		return;
	}

	if (sc->detaching) {
		if (sc->admin != NULL) {
			/*
			 * This unsticks the detach process if a
			 * transport error occurs during detach.
			 */
			nvmf_shutdown_qp(sc->admin);
		}
		sx_xunlock(&sc->connection_lock);
		return;
	}

	if (sc->cdev == NULL) {
		/*
		 * A transport error occurred during attach
		 * (nvmf_add_namespaces).  Shut down the admin queue.
		 */
		nvmf_shutdown_qp(sc->admin);
		sx_xunlock(&sc->connection_lock);
		return;
	}

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);
	sc->ka_traffic = false;

	/* Quiesce namespace consumers. */
	nvmf_disconnect_sim(sc);
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_disconnect_ns(sc->ns[i]);
	}

	/* Shut down the existing qpairs. */
	for (i = 0; i < sc->num_io_queues; i++) {
		nvmf_destroy_qp(sc->io[i]);
	}
	free(sc->io, M_NVMF);
	sc->io = NULL;
	sc->num_io_queues = 0;
	nvmf_destroy_qp(sc->admin);
	sc->admin = NULL;

	sx_xunlock(&sc->connection_lock);
}

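/*
 * Handle a reconnection handoff from userland.  The new association
 * must use the same transport type and connect to the same subsystem
 * NQN as the original association, although the controller ID may
 * differ under the dynamic controller model.
 */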
static int
nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_handoff_host *hh)
{
	struct nvmf_ivars ivars;
	u_int i;
	int error;

	/* XXX: Should we permit changing the transport type? */
	if (sc->trtype != hh->trtype) {
		device_printf(sc->dev,
		    "transport type mismatch on reconnect\n");
		return (EINVAL);
	}

	error = nvmf_init_ivars(&ivars, hh);
	if (error != 0)
		return (error);

	sx_xlock(&sc->connection_lock);
	if (sc->admin != NULL || sc->detaching) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Ensure this is for the same controller.  Note that the
	 * controller ID can vary across associations if the remote
	 * system is using the dynamic controller model.  This merely
	 * ensures the new association is connected to the same NVMe
	 * subsystem.
	 */
	if (memcmp(sc->cdata->subnqn, ivars.cdata->subnqn,
	    sizeof(ivars.cdata->subnqn)) != 0) {
		device_printf(sc->dev,
		    "controller subsystem NQN mismatch on reconnect\n");
		error = EINVAL;
		goto out;
	}

	/*
	 * XXX: Require same number and size of I/O queues so that
	 * max_pending_io is still correct?
	 */

	error = nvmf_establish_connection(sc, &ivars);
	if (error != 0)
		goto out;

	error = nvmf_start_aer(sc);
	if (error != 0)
		goto out;

	device_printf(sc->dev,
	    "established new association with %u I/O queues\n",
	    sc->num_io_queues);

	/* Restart namespace consumers. */
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_reconnect_ns(sc->ns[i]);
	}
	nvmf_reconnect_sim(sc);

	nvmf_rescan_all_ns(sc);
out:
	sx_xunlock(&sc->connection_lock);
	nvmf_free_ivars(&ivars);
	return (error);
}

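/*
 * Shutdown hooks: before filesystems are synced, abort pending I/O
 * requests if the association is disconnected so that unmounts do not
 * hang; after the sync, gracefully shut down a still-connected
 * controller and tear down its queues.
 */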
static void
nvmf_shutdown_pre_sync(void *arg, int howto)
{
	struct nvmf_softc *sc = arg;

	if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED())
		return;

	/*
	 * If this association is disconnected, abort any pending
	 * requests with an error to permit filesystems to unmount
	 * without hanging.
	 */
	sx_xlock(&sc->connection_lock);
	if (sc->admin != NULL || sc->detaching) {
		sx_xunlock(&sc->connection_lock);
		return;
	}

	for (u_int i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_shutdown_ns(sc->ns[i]);
	}
	nvmf_shutdown_sim(sc);
	sx_xunlock(&sc->connection_lock);
}

static void
nvmf_shutdown_post_sync(void *arg, int howto)
{
	struct nvmf_softc *sc = arg;

	if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED())
		return;

	/*
	 * If this association is connected, disconnect gracefully.
	 */
	sx_xlock(&sc->connection_lock);
	if (sc->admin == NULL || sc->detaching) {
		sx_xunlock(&sc->connection_lock);
		return;
	}

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);

	nvmf_shutdown_controller(sc);
	for (u_int i = 0; i < sc->num_io_queues; i++) {
		nvmf_destroy_qp(sc->io[i]);
	}
	nvmf_destroy_qp(sc->admin);
	sc->admin = NULL;
	sx_xunlock(&sc->connection_lock);
}

static int
nvmf_detach(device_t dev)
{
	struct nvmf_softc *sc = device_get_softc(dev);
	u_int i;

	destroy_dev(sc->cdev);

	sx_xlock(&sc->connection_lock);
	sc->detaching = true;
	sx_xunlock(&sc->connection_lock);

	EVENTHANDLER_DEREGISTER(shutdown_pre_sync, sc->shutdown_pre_sync_eh);
	EVENTHANDLER_DEREGISTER(shutdown_post_sync, sc->shutdown_post_sync_eh);

	nvmf_destroy_sim(sc);
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_destroy_ns(sc->ns[i]);
	}
	free(sc->ns, M_NVMF);

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);

	if (sc->admin != NULL)
		nvmf_shutdown_controller(sc);

	for (i = 0; i < sc->num_io_queues; i++) {
		nvmf_destroy_qp(sc->io[i]);
	}
	free(sc->io, M_NVMF);

	taskqueue_drain(taskqueue_thread, &sc->disconnect_task);

	if (sc->admin != NULL)
		nvmf_destroy_qp(sc->admin);

	nvmf_destroy_aer(sc);

	sx_destroy(&sc->connection_lock);
	free(sc->cdata, M_NVMF);
	return (0);
}

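/*
 * Apply new identify data to a single namespace: create, update, or
 * destroy the namespace as needed and then ask CAM to rescan it.
 */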
static void
nvmf_rescan_ns_1(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data)
{
	struct nvmf_namespace *ns;

	/* XXX: Needs locking around sc->ns[]. */
	ns = sc->ns[nsid - 1];
	if (data->nsze == 0) {
		/* XXX: Needs locking */
		if (ns != NULL) {
			nvmf_destroy_ns(ns);
			sc->ns[nsid - 1] = NULL;
		}
	} else {
		/* XXX: Needs locking */
		if (ns == NULL) {
			sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
		} else {
			if (!nvmf_update_ns(ns, data)) {
				nvmf_destroy_ns(ns);
				sc->ns[nsid - 1] = NULL;
			}
		}
	}

	nvmf_sim_rescan_ns(sc, nsid);
}

void
nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid)
{
	struct nvmf_completion_status status;
	struct nvme_namespace_data *data;

	data = malloc(sizeof(*data), M_NVMF, M_WAITOK);

	nvmf_status_init(&status);
	nvmf_status_wait_io(&status);
	if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
	    &status, nvmf_io_complete, &status, M_WAITOK)) {
		device_printf(sc->dev,
		    "failed to send IDENTIFY namespace %u command\n", nsid);
		free(data, M_NVMF);
		return;
	}
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev,
		    "IDENTIFY namespace %u failed, status %#x\n", nsid,
		    le16toh(status.cqe.status));
		free(data, M_NVMF);
		return;
	}

	if (status.io_error != 0) {
		device_printf(sc->dev,
		    "IDENTIFY namespace %u failed with I/O error %d\n",
		    nsid, status.io_error);
		free(data, M_NVMF);
		return;
	}

	nvme_namespace_data_swapbytes(data);

	nvmf_rescan_ns_1(sc, nsid, data);

	free(data, M_NVMF);
}

static void
nvmf_purge_namespaces(struct nvmf_softc *sc, uint32_t first_nsid,
    uint32_t next_valid_nsid)
{
	struct nvmf_namespace *ns;

	for (uint32_t nsid = first_nsid; nsid < next_valid_nsid; nsid++) {
		/* XXX: Needs locking around sc->ns[]. */
		ns = sc->ns[nsid - 1];
		if (ns != NULL) {
			nvmf_destroy_ns(ns);
			sc->ns[nsid - 1] = NULL;

			nvmf_sim_rescan_ns(sc, nsid);
		}
	}
}

static bool
nvmf_rescan_ns_cb(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data, void *arg)
{
	uint32_t *last_nsid = arg;

	/* Check for any gaps prior to this namespace. */
	nvmf_purge_namespaces(sc, *last_nsid + 1, nsid);
	*last_nsid = nsid;

	nvmf_rescan_ns_1(sc, nsid, data);
	return (true);
}

void
nvmf_rescan_all_ns(struct nvmf_softc *sc)
{
	uint32_t last_nsid;

	last_nsid = 0;
	if (!nvmf_scan_active_namespaces(sc, nvmf_rescan_ns_cb, &last_nsid))
		return;

	/*
	 * Check for any namespace devices after the last active
	 * namespace.
	 */
	nvmf_purge_namespaces(sc, last_nsid + 1, sc->cdata->nn + 1);
}

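/*
 * Execute a userland passthrough command (NVME_PASSTHROUGH_CMD ioctl)
 * by bouncing the user buffer through a kernel allocation, building a
 * fresh SQE from the permitted fields, and submitting it synchronously
 * to the admin queue or an I/O queue.
 */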
int
nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt,
    bool admin)
{
	struct nvmf_completion_status status;
	struct nvme_command cmd;
	struct memdesc mem;
	struct nvmf_host_qpair *qp;
	struct nvmf_request *req;
	void *buf;
	int error;

	if (pt->len > sc->max_xfer_size)
		return (EINVAL);

	buf = NULL;
	if (pt->len != 0) {
		/*
		 * XXX: Depending on the size we may want to pin the
		 * user pages and use a memdesc with vm_page_t's
		 * instead.
		 */
		buf = malloc(pt->len, M_NVMF, M_WAITOK);
		if (pt->is_read == 0) {
			error = copyin(pt->buf, buf, pt->len);
			if (error != 0) {
				free(buf, M_NVMF);
				return (error);
			}
		} else {
			/* Ensure no kernel data is leaked to userland. */
			memset(buf, 0, pt->len);
		}
	}

	memset(&cmd, 0, sizeof(cmd));
	cmd.opc = pt->cmd.opc;
	cmd.fuse = pt->cmd.fuse;
	cmd.nsid = pt->cmd.nsid;
	cmd.cdw10 = pt->cmd.cdw10;
	cmd.cdw11 = pt->cmd.cdw11;
	cmd.cdw12 = pt->cmd.cdw12;
	cmd.cdw13 = pt->cmd.cdw13;
	cmd.cdw14 = pt->cmd.cdw14;
	cmd.cdw15 = pt->cmd.cdw15;

	if (admin)
		qp = sc->admin;
	else
		qp = nvmf_select_io_queue(sc);
	nvmf_status_init(&status);
	req = nvmf_allocate_request(qp, &cmd, nvmf_complete, &status, M_WAITOK);
	if (req == NULL) {
		device_printf(sc->dev,
		    "failed to allocate request for passthrough command\n");
		error = ECONNABORTED;
		goto error;
	}

	if (pt->len != 0) {
		mem = memdesc_vaddr(buf, pt->len);
		nvmf_capsule_append_data(req->nc, &mem, pt->len,
		    pt->is_read == 0, nvmf_io_complete, &status);
		nvmf_status_wait_io(&status);
	}

	nvmf_submit_request(req);
	nvmf_wait_for_reply(&status);

	memset(&pt->cpl, 0, sizeof(pt->cpl));
	pt->cpl.cdw0 = status.cqe.cdw0;
	pt->cpl.status = status.cqe.status;

	error = status.io_error;
	if (error == 0 && pt->len != 0 && pt->is_read != 0)
		error = copyout(buf, pt->buf, pt->len);
error:
	free(buf, M_NVMF);
	return (error);
}

static int
nvmf_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
    struct thread *td)
{
	struct nvmf_softc *sc = cdev->si_drv1;
	struct nvme_get_nsid *gnsid;
	struct nvme_pt_command *pt;
	struct nvmf_reconnect_params *rp;
	struct nvmf_handoff_host *hh;

	switch (cmd) {
	case NVME_PASSTHROUGH_CMD:
		pt = (struct nvme_pt_command *)arg;
		return (nvmf_passthrough_cmd(sc, pt, true));
	case NVME_GET_NSID:
		gnsid = (struct nvme_get_nsid *)arg;
		strlcpy(gnsid->cdev, device_get_nameunit(sc->dev),
		    sizeof(gnsid->cdev));
		gnsid->nsid = 0;
		return (0);
	case NVME_GET_MAX_XFER_SIZE:
		*(uint64_t *)arg = sc->max_xfer_size;
		return (0);
	case NVMF_RECONNECT_PARAMS:
		rp = (struct nvmf_reconnect_params *)arg;
		if ((sc->cdata->fcatt & 1) == 0)
			rp->cntlid = NVMF_CNTLID_DYNAMIC;
		else
			rp->cntlid = sc->cdata->ctrlr_id;
		memcpy(rp->subnqn, sc->cdata->subnqn, sizeof(rp->subnqn));
		return (0);
	case NVMF_RECONNECT_HOST:
		hh = (struct nvmf_handoff_host *)arg;
		return (nvmf_reconnect_host(sc, hh));
	default:
		return (ENOTTY);
	}
}

static struct cdevsw nvmf_cdevsw = {
	.d_version = D_VERSION,
	.d_ioctl = nvmf_ioctl
};

static int
nvmf_modevent(module_t mod, int what, void *arg)
{
	switch (what) {
	case MOD_LOAD:
		return (nvmf_ctl_load());
	case MOD_QUIESCE:
		return (0);
	case MOD_UNLOAD:
		nvmf_ctl_unload();
		destroy_dev_drain(&nvmf_cdevsw);
		return (0);
	default:
		return (EOPNOTSUPP);
	}
}

static device_method_t nvmf_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,     nvmf_probe),
	DEVMETHOD(device_attach,    nvmf_attach),
	DEVMETHOD(device_detach,    nvmf_detach),
	DEVMETHOD_END
};

driver_t nvme_nvmf_driver = {
	"nvme",
	nvmf_methods,
	sizeof(struct nvmf_softc),
};

DRIVER_MODULE(nvme, root, nvme_nvmf_driver, nvmf_modevent, NULL);
MODULE_DEPEND(nvmf, nvmf_transport, 1, 1, 1);