/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2023-2024 Chelsio Communications, Inc.
 * Written by: John Baldwin <jhb@FreeBSD.org>
 */

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/dnv.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/memdesc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/nv.h>
#include <sys/reboot.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/nvmf.h>
#include <dev/nvmf/nvmf_transport.h>
#include <dev/nvmf/host/nvmf_var.h>

static struct cdevsw nvmf_cdevsw;

bool nvmf_fail_disconnect = false;
SYSCTL_BOOL(_kern_nvmf, OID_AUTO, fail_on_disconnection, CTLFLAG_RWTUN,
    &nvmf_fail_disconnect, 0, "Fail I/O requests on connection failure");

MALLOC_DEFINE(M_NVMF, "nvmf", "NVMe over Fabrics host");

static void	nvmf_disconnect_task(void *arg, int pending);
static void	nvmf_shutdown_pre_sync(void *arg, int howto);
static void	nvmf_shutdown_post_sync(void *arg, int howto);

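/*
 * Admin/fabrics command completion callback: save the completion
 * queue entry and wake any thread sleeping in nvmf_wait_for_reply().
 */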
void
nvmf_complete(void *arg, const struct nvme_completion *cqe)
{
	struct nvmf_completion_status *status = arg;
	struct mtx *mtx;

	status->cqe = *cqe;
	mtx = mtx_pool_find(mtxpool_sleep, status);
	mtx_lock(mtx);
	status->done = true;
	mtx_unlock(mtx);
	wakeup(status);
}

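/*
 * Data-transfer completion callback: record the I/O error (if any)
 * and wake any thread sleeping in nvmf_wait_for_reply().
 */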
void
nvmf_io_complete(void *arg, size_t xfered, int error)
{
	struct nvmf_completion_status *status = arg;
	struct mtx *mtx;

	status->io_error = error;
	mtx = mtx_pool_find(mtxpool_sleep, status);
	mtx_lock(mtx);
	status->io_done = true;
	mtx_unlock(mtx);
	wakeup(status);
}

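/*
 * Sleep until both the command completion and any associated data
 * transfer have finished.
 */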
void
nvmf_wait_for_reply(struct nvmf_completion_status *status)
{
	struct mtx *mtx;

	mtx = mtx_pool_find(mtxpool_sleep, status);
	mtx_lock(mtx);
	while (!status->done || !status->io_done)
		mtx_sleep(status, mtx, 0, "nvmfcmd", 0);
	mtx_unlock(mtx);
}

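/*
 * Read a 4 or 8 byte controller property via a Fabrics PROPERTY_GET
 * command.
 */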
static int
nvmf_read_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
    uint64_t *value)
{
	const struct nvmf_fabric_prop_get_rsp *rsp;
	struct nvmf_completion_status status;

	nvmf_status_init(&status);
	if (!nvmf_cmd_get_property(sc, offset, size, nvmf_complete, &status,
	    M_WAITOK))
		return (ECONNABORTED);
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev, "PROPERTY_GET failed, status %#x\n",
		    le16toh(status.cqe.status));
		return (EIO);
	}

	rsp = (const struct nvmf_fabric_prop_get_rsp *)&status.cqe;
	if (size == 8)
		*value = le64toh(rsp->value.u64);
	else
		*value = le32toh(rsp->value.u32.low);
	return (0);
}

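/*
 * Write a 4 or 8 byte controller property via a Fabrics PROPERTY_SET
 * command.
 */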
static int
nvmf_write_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
    uint64_t value)
{
	struct nvmf_completion_status status;

	nvmf_status_init(&status);
	if (!nvmf_cmd_set_property(sc, offset, size, value, nvmf_complete, &status,
	    M_WAITOK))
		return (ECONNABORTED);
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev, "PROPERTY_SET failed, status %#x\n",
		    le16toh(status.cqe.status));
		return (EIO);
	}
	return (0);
}

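/*
 * Request a normal controller shutdown by setting CC.SHN.
 */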
static void
nvmf_shutdown_controller(struct nvmf_softc *sc)
{
	uint64_t cc;
	int error;

	error = nvmf_read_property(sc, NVMF_PROP_CC, 4, &cc);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch CC for shutdown\n");
		return;
	}

	cc |= NVMEF(NVME_CC_REG_SHN, NVME_SHN_NORMAL);

	error = nvmf_write_property(sc, NVMF_PROP_CC, 4, cc);
	if (error != 0)
		device_printf(sc->dev,
		    "Failed to set CC to trigger shutdown\n");
}

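/*
 * KeepAlive receive timer: if no traffic has arrived from the
 * controller during the interval, drop the association.
 */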
static void
nvmf_check_keep_alive(void *arg)
{
	struct nvmf_softc *sc = arg;
	int traffic;

	traffic = atomic_readandclear_int(&sc->ka_active_rx_traffic);
	if (traffic == 0) {
		device_printf(sc->dev,
		    "disconnecting due to KeepAlive timeout\n");
		nvmf_disconnect(sc);
		return;
	}

	callout_schedule_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0, C_HARDCLOCK);
}

static void
nvmf_keep_alive_complete(void *arg, const struct nvme_completion *cqe)
{
	struct nvmf_softc *sc = arg;

	atomic_store_int(&sc->ka_active_rx_traffic, 1);
	if (cqe->status != 0) {
		device_printf(sc->dev,
		    "KeepAlive response reported status %#x\n",
		    le16toh(cqe->status));
	}
}

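/*
 * KeepAlive transmit timer: periodically send a KeepAlive command
 * unless other commands have already been sent during the interval.
 */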
static void
nvmf_send_keep_alive(void *arg)
{
	struct nvmf_softc *sc = arg;
	int traffic;

	/*
	 * Don't bother sending a KeepAlive command if TKAS is active
	 * and another command has been sent during the interval.
	 */
	traffic = atomic_load_int(&sc->ka_active_tx_traffic);
	if (traffic == 0 && !nvmf_cmd_keep_alive(sc, nvmf_keep_alive_complete,
	    sc, M_NOWAIT))
		device_printf(sc->dev,
		    "Failed to allocate KeepAlive command\n");

	/* Clear ka_active_tx_traffic after sending the keep alive command. */
	atomic_store_int(&sc->ka_active_tx_traffic, 0);

	callout_schedule_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, C_HARDCLOCK);
}

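/*
 * Copy in and validate a connection handoff nvlist from userland,
 * returning the unpacked nvlist on success.
 */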
int
nvmf_copyin_handoff(const struct nvmf_ioc_nv *nv, nvlist_t **nvlp)
{
	const nvlist_t *const *io;
	const nvlist_t *admin;
	nvlist_t *nvl;
	size_t i, num_io_queues;
	uint32_t qsize;
	int error;

	error = nvmf_unpack_ioc_nvlist(nv, &nvl);
	if (error != 0)
		return (error);

	if (!nvlist_exists_number(nvl, "trtype") ||
	    !nvlist_exists_nvlist(nvl, "admin") ||
	    !nvlist_exists_nvlist_array(nvl, "io") ||
	    !nvlist_exists_binary(nvl, "cdata"))
		goto invalid;

	admin = nvlist_get_nvlist(nvl, "admin");
	if (!nvmf_validate_qpair_nvlist(admin, false))
		goto invalid;
	if (!nvlist_get_bool(admin, "admin"))
		goto invalid;

	io = nvlist_get_nvlist_array(nvl, "io", &num_io_queues);
	if (num_io_queues < 1)
		goto invalid;
	for (i = 0; i < num_io_queues; i++) {
		if (!nvmf_validate_qpair_nvlist(io[i], false))
			goto invalid;
	}

	/* Require all I/O queues to be the same size. */
	qsize = nvlist_get_number(io[0], "qsize");
	for (i = 1; i < num_io_queues; i++) {
		if (nvlist_get_number(io[i], "qsize") != qsize)
			goto invalid;
	}

	nvlist_get_binary(nvl, "cdata", &i);
	if (i != sizeof(struct nvme_controller_data))
		goto invalid;

	*nvlp = nvl;
	return (0);
invalid:
	nvlist_destroy(nvl);
	return (EINVAL);
}

static int
nvmf_probe(device_t dev)
{
	const nvlist_t *nvl = device_get_ivars(dev);
	const struct nvme_controller_data *cdata;

	if (nvl == NULL)
		return (ENXIO);

	cdata = nvlist_get_binary(nvl, "cdata", NULL);
	device_set_descf(dev, "Fabrics: %.256s", cdata->subnqn);
	return (BUS_PROBE_DEFAULT);
}

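/*
 * Create the admin and I/O queue pairs described by the handoff
 * nvlist, start the KeepAlive timers, and cache the controller data
 * for the new association.
 */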
static int
nvmf_establish_connection(struct nvmf_softc *sc, const nvlist_t *nvl)
{
	const nvlist_t *const *io;
	const nvlist_t *admin;
	uint64_t kato;
	size_t num_io_queues;
	enum nvmf_trtype trtype;
	char name[16];

	trtype = nvlist_get_number(nvl, "trtype");
	admin = nvlist_get_nvlist(nvl, "admin");
	io = nvlist_get_nvlist_array(nvl, "io", &num_io_queues);
	kato = dnvlist_get_number(nvl, "kato", 0);

	/* Set up the admin queue. */
	sc->admin = nvmf_init_qp(sc, trtype, admin, "admin queue", 0);
	if (sc->admin == NULL) {
		device_printf(sc->dev, "Failed to setup admin queue\n");
		return (ENXIO);
	}

	/* Set up the I/O queues. */
	sc->io = malloc(num_io_queues * sizeof(*sc->io), M_NVMF,
	    M_WAITOK | M_ZERO);
	sc->num_io_queues = num_io_queues;
	for (u_int i = 0; i < sc->num_io_queues; i++) {
		snprintf(name, sizeof(name), "I/O queue %u", i);
		sc->io[i] = nvmf_init_qp(sc, trtype, io[i], name, i);
		if (sc->io[i] == NULL) {
			device_printf(sc->dev, "Failed to setup I/O queue %u\n",
			    i + 1);
			return (ENXIO);
		}
	}

	/* Start KeepAlive timers. */
	if (kato != 0) {
		sc->ka_traffic = NVMEV(NVME_CTRLR_DATA_CTRATT_TBKAS,
		    sc->cdata->ctratt) != 0;
		sc->ka_rx_sbt = mstosbt(kato);
		sc->ka_tx_sbt = sc->ka_rx_sbt / 2;
		callout_reset_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0,
		    nvmf_check_keep_alive, sc, C_HARDCLOCK);
		callout_reset_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0,
		    nvmf_send_keep_alive, sc, C_HARDCLOCK);
	}

	memcpy(sc->cdata, nvlist_get_binary(nvl, "cdata", NULL),
	    sizeof(*sc->cdata));

	return (0);
}

typedef bool nvmf_scan_active_ns_cb(struct nvmf_softc *, uint32_t,
    const struct nvme_namespace_data *, void *);

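/*
 * Fetch one page of the active namespace list starting at *nsidp and
 * invoke the callback for each namespace found, updating *nsidp for
 * the next iteration (0 once the list is exhausted).
 */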
static bool
nvmf_scan_active_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist,
    struct nvme_namespace_data *data, uint32_t *nsidp,
    nvmf_scan_active_ns_cb *cb, void *cb_arg)
{
	struct nvmf_completion_status status;
	uint32_t nsid;

	nvmf_status_init(&status);
	nvmf_status_wait_io(&status);
	if (!nvmf_cmd_identify_active_namespaces(sc, *nsidp, nslist,
	    nvmf_complete, &status, nvmf_io_complete, &status, M_WAITOK)) {
		device_printf(sc->dev,
		    "failed to send IDENTIFY active namespaces command\n");
		return (false);
	}
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev,
		    "IDENTIFY active namespaces failed, status %#x\n",
		    le16toh(status.cqe.status));
		return (false);
	}

	if (status.io_error != 0) {
		device_printf(sc->dev,
		    "IDENTIFY active namespaces failed with I/O error %d\n",
		    status.io_error);
		return (false);
	}

	for (u_int i = 0; i < nitems(nslist->ns); i++) {
		nsid = nslist->ns[i];
		if (nsid == 0) {
			*nsidp = 0;
			return (true);
		}

		nvmf_status_init(&status);
		nvmf_status_wait_io(&status);
		if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
		    &status, nvmf_io_complete, &status, M_WAITOK)) {
			device_printf(sc->dev,
			    "failed to send IDENTIFY namespace %u command\n",
			    nsid);
			return (false);
		}
		nvmf_wait_for_reply(&status);

		if (status.cqe.status != 0) {
			device_printf(sc->dev,
			    "IDENTIFY namespace %u failed, status %#x\n", nsid,
			    le16toh(status.cqe.status));
			return (false);
		}

		if (status.io_error != 0) {
			device_printf(sc->dev,
			    "IDENTIFY namespace %u failed with I/O error %d\n",
			    nsid, status.io_error);
			return (false);
		}

		nvme_namespace_data_swapbytes(data);
		if (!cb(sc, nsid, data, cb_arg))
			return (false);
	}

	MPASS(nsid == nslist->ns[nitems(nslist->ns) - 1] && nsid != 0);

	if (nsid >= NVME_GLOBAL_NAMESPACE_TAG - 1)
		*nsidp = 0;
	else
		*nsidp = nsid;
	return (true);
}

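/*
 * Walk the controller's entire active namespace list, invoking the
 * callback for each active namespace.
 */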
static bool
nvmf_scan_active_namespaces(struct nvmf_softc *sc, nvmf_scan_active_ns_cb *cb,
    void *cb_arg)
{
	struct nvme_namespace_data *data;
	struct nvme_ns_list *nslist;
	uint32_t nsid;
	bool retval;

	nslist = malloc(sizeof(*nslist), M_NVMF, M_WAITOK);
	data = malloc(sizeof(*data), M_NVMF, M_WAITOK);

	nsid = 0;
	retval = true;
	for (;;) {
		if (!nvmf_scan_active_nslist(sc, nslist, data, &nsid, cb,
		    cb_arg)) {
			retval = false;
			break;
		}
		if (nsid == 0)
			break;
	}

	free(data, M_NVMF);
	free(nslist, M_NVMF);
	return (retval);
}

static bool
nvmf_add_ns(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data, void *arg __unused)
{
	if (sc->ns[nsid - 1] != NULL) {
		device_printf(sc->dev,
		    "duplicate namespace %u in active namespace list\n",
		    nsid);
		return (false);
	}

	/*
	 * As in nvme_ns_construct, a size of zero indicates an
	 * invalid namespace.
	 */
	if (data->nsze == 0) {
		device_printf(sc->dev,
		    "ignoring active namespace %u with zero size\n", nsid);
		return (true);
	}

	sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);

	nvmf_sim_rescan_ns(sc, nsid);
	return (true);
}

static bool
nvmf_add_namespaces(struct nvmf_softc *sc)
{
	sc->ns = mallocarray(sc->cdata->nn, sizeof(*sc->ns), M_NVMF,
	    M_WAITOK | M_ZERO);
	return (nvmf_scan_active_namespaces(sc, nvmf_add_ns, NULL));
}

static int
nvmf_attach(device_t dev)
{
	struct make_dev_args mda;
	struct nvmf_softc *sc = device_get_softc(dev);
	const nvlist_t *nvl = device_get_ivars(dev);
	const nvlist_t * const *io;
	struct sysctl_oid *oid;
	uint64_t val;
	u_int i;
	int error;

	if (nvl == NULL)
		return (ENXIO);

	sc->dev = dev;
	sc->trtype = nvlist_get_number(nvl, "trtype");
	callout_init(&sc->ka_rx_timer, 1);
	callout_init(&sc->ka_tx_timer, 1);
	sx_init(&sc->connection_lock, "nvmf connection");
	TASK_INIT(&sc->disconnect_task, 0, nvmf_disconnect_task, sc);

	oid = SYSCTL_ADD_NODE(device_get_sysctl_ctx(dev),
	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "ioq",
	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "I/O Queues");
	sc->ioq_oid_list = SYSCTL_CHILDREN(oid);

	sc->cdata = malloc(sizeof(*sc->cdata), M_NVMF, M_WAITOK);

	nvmf_init_aer(sc);

	error = nvmf_establish_connection(sc, nvl);
	if (error != 0)
		goto out;

	error = nvmf_read_property(sc, NVMF_PROP_CAP, 8, &sc->cap);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch CAP\n");
		error = ENXIO;
		goto out;
	}

	error = nvmf_read_property(sc, NVMF_PROP_VS, 4, &val);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch VS\n");
		error = ENXIO;
		goto out;
	}
	sc->vs = val;

	/* Honor MDTS if it is set. */
	sc->max_xfer_size = maxphys;
	if (sc->cdata->mdts != 0) {
		sc->max_xfer_size = ulmin(sc->max_xfer_size,
		    1 << (sc->cdata->mdts + NVME_MPS_SHIFT +
		    NVME_CAP_HI_MPSMIN(sc->cap >> 32)));
	}

	io = nvlist_get_nvlist_array(nvl, "io", NULL);
	sc->max_pending_io = nvlist_get_number(io[0], "qsize") *
	    sc->num_io_queues;

	error = nvmf_init_sim(sc);
	if (error != 0)
		goto out;

	error = nvmf_start_aer(sc);
	if (error != 0) {
		nvmf_destroy_sim(sc);
		goto out;
	}

	if (!nvmf_add_namespaces(sc)) {
		nvmf_destroy_sim(sc);
		goto out;
	}

	make_dev_args_init(&mda);
	mda.mda_devsw = &nvmf_cdevsw;
	mda.mda_uid = UID_ROOT;
	mda.mda_gid = GID_WHEEL;
	mda.mda_mode = 0600;
	mda.mda_si_drv1 = sc;
	error = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev));
	if (error != 0) {
		nvmf_destroy_sim(sc);
		goto out;
	}

	sc->shutdown_pre_sync_eh = EVENTHANDLER_REGISTER(shutdown_pre_sync,
	    nvmf_shutdown_pre_sync, sc, SHUTDOWN_PRI_FIRST);
	sc->shutdown_post_sync_eh = EVENTHANDLER_REGISTER(shutdown_post_sync,
	    nvmf_shutdown_post_sync, sc, SHUTDOWN_PRI_FIRST);

	return (0);
out:
	if (sc->ns != NULL) {
		for (i = 0; i < sc->cdata->nn; i++) {
			if (sc->ns[i] != NULL)
				nvmf_destroy_ns(sc->ns[i]);
		}
		free(sc->ns, M_NVMF);
	}

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);

	if (sc->admin != NULL)
		nvmf_shutdown_controller(sc);

	for (i = 0; i < sc->num_io_queues; i++) {
		if (sc->io[i] != NULL)
			nvmf_destroy_qp(sc->io[i]);
	}
	free(sc->io, M_NVMF);
	if (sc->admin != NULL)
		nvmf_destroy_qp(sc->admin);

	nvmf_destroy_aer(sc);

	taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
	sx_destroy(&sc->connection_lock);
	free(sc->cdata, M_NVMF);
	return (error);
}

void
nvmf_disconnect(struct nvmf_softc *sc)
{
	taskqueue_enqueue(taskqueue_thread, &sc->disconnect_task);
}

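/*
 * Tear down the current association after a transport error or
 * KeepAlive timeout, quiescing namespace consumers and destroying
 * the existing queue pairs so that a later reconnect can create new
 * ones.
 */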
static void
nvmf_disconnect_task(void *arg, int pending __unused)
{
	struct nvmf_softc *sc = arg;
	u_int i;

	sx_xlock(&sc->connection_lock);
	if (sc->admin == NULL) {
		/*
		 * Ignore transport errors if there is no active
		 * association.
		 */
		sx_xunlock(&sc->connection_lock);
		return;
	}

	if (sc->detaching) {
		if (sc->admin != NULL) {
			/*
			 * This unsticks the detach process if a
			 * transport error occurs during detach.
			 */
			nvmf_shutdown_qp(sc->admin);
		}
		sx_xunlock(&sc->connection_lock);
		return;
	}

	if (sc->cdev == NULL) {
		/*
		 * A transport error occurred during attach
		 * (nvmf_add_namespaces).  Shut down the admin queue.
		 */
		nvmf_shutdown_qp(sc->admin);
		sx_xunlock(&sc->connection_lock);
		return;
	}

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);
	sc->ka_traffic = false;

	/* Quiesce namespace consumers. */
	nvmf_disconnect_sim(sc);
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_disconnect_ns(sc->ns[i]);
	}

	/* Shut down the existing qpairs. */
	for (i = 0; i < sc->num_io_queues; i++) {
		nvmf_destroy_qp(sc->io[i]);
	}
	free(sc->io, M_NVMF);
	sc->io = NULL;
	sc->num_io_queues = 0;
	nvmf_destroy_qp(sc->admin);
	sc->admin = NULL;

	sx_xunlock(&sc->connection_lock);
}

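/*
 * Establish a new association using queue pairs handed off from
 * userland and resume namespace consumers, after verifying that the
 * new association is for the same NVMe subsystem.
 */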
static int
nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
{
	const struct nvme_controller_data *cdata;
	nvlist_t *nvl;
	u_int i;
	int error;

	error = nvmf_copyin_handoff(nv, &nvl);
	if (error != 0)
		return (error);

	/* XXX: Should we permit changing the transport type? */
	if (sc->trtype != nvlist_get_number(nvl, "trtype")) {
		device_printf(sc->dev,
		    "transport type mismatch on reconnect\n");
		nvlist_destroy(nvl);
		return (EINVAL);
	}

	sx_xlock(&sc->connection_lock);
	if (sc->admin != NULL || sc->detaching) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Ensure this is for the same controller.  Note that the
	 * controller ID can vary across associations if the remote
	 * system is using the dynamic controller model.  This merely
	 * ensures the new association is connected to the same NVMe
	 * subsystem.
	 */
	cdata = nvlist_get_binary(nvl, "cdata", NULL);
	if (memcmp(sc->cdata->subnqn, cdata->subnqn,
	    sizeof(cdata->subnqn)) != 0) {
		device_printf(sc->dev,
		    "controller subsystem NQN mismatch on reconnect\n");
		error = EINVAL;
		goto out;
	}

	/*
	 * XXX: Require same number and size of I/O queues so that
	 * max_pending_io is still correct?
	 */

	error = nvmf_establish_connection(sc, nvl);
	if (error != 0)
		goto out;

	error = nvmf_start_aer(sc);
	if (error != 0)
		goto out;

	device_printf(sc->dev,
	    "established new association with %u I/O queues\n",
	    sc->num_io_queues);

	/* Restart namespace consumers. */
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_reconnect_ns(sc->ns[i]);
	}
	nvmf_reconnect_sim(sc);

	nvmf_rescan_all_ns(sc);
out:
	sx_xunlock(&sc->connection_lock);
	nvlist_destroy(nvl);
	return (error);
}

static void
nvmf_shutdown_pre_sync(void *arg, int howto)
{
	struct nvmf_softc *sc = arg;

	if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED())
		return;

	/*
	 * If this association is disconnected, abort any pending
	 * requests with an error to permit filesystems to unmount
	 * without hanging.
	 */
	sx_xlock(&sc->connection_lock);
	if (sc->admin != NULL || sc->detaching) {
		sx_xunlock(&sc->connection_lock);
		return;
	}

	for (u_int i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_shutdown_ns(sc->ns[i]);
	}
	nvmf_shutdown_sim(sc);
	sx_xunlock(&sc->connection_lock);
}

static void
nvmf_shutdown_post_sync(void *arg, int howto)
{
	struct nvmf_softc *sc = arg;

	if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED())
		return;

	/*
	 * If this association is connected, disconnect gracefully.
	 */
	sx_xlock(&sc->connection_lock);
	if (sc->admin == NULL || sc->detaching) {
		sx_xunlock(&sc->connection_lock);
		return;
	}

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);

	nvmf_shutdown_controller(sc);
	for (u_int i = 0; i < sc->num_io_queues; i++) {
		nvmf_destroy_qp(sc->io[i]);
	}
	nvmf_destroy_qp(sc->admin);
	sc->admin = NULL;
	sx_xunlock(&sc->connection_lock);
}

static int
nvmf_detach(device_t dev)
{
	struct nvmf_softc *sc = device_get_softc(dev);
	u_int i;

	destroy_dev(sc->cdev);

	sx_xlock(&sc->connection_lock);
	sc->detaching = true;
	sx_xunlock(&sc->connection_lock);

	EVENTHANDLER_DEREGISTER(shutdown_pre_sync, sc->shutdown_pre_sync_eh);
	EVENTHANDLER_DEREGISTER(shutdown_post_sync, sc->shutdown_post_sync_eh);

	nvmf_destroy_sim(sc);
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_destroy_ns(sc->ns[i]);
	}
	free(sc->ns, M_NVMF);

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);

	if (sc->admin != NULL)
		nvmf_shutdown_controller(sc);

	for (i = 0; i < sc->num_io_queues; i++) {
		nvmf_destroy_qp(sc->io[i]);
	}
	free(sc->io, M_NVMF);

	taskqueue_drain(taskqueue_thread, &sc->disconnect_task);

	if (sc->admin != NULL)
		nvmf_destroy_qp(sc->admin);

	nvmf_destroy_aer(sc);

	sx_destroy(&sc->connection_lock);
	free(sc->cdata, M_NVMF);
	return (0);
}

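/*
 * Update the in-core state of a single namespace from freshly
 * fetched identify data, creating, updating, or destroying the
 * namespace as needed, and request a SIM rescan of it.
 */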
static void
nvmf_rescan_ns_1(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data)
{
	struct nvmf_namespace *ns;

	/* XXX: Needs locking around sc->ns[]. */
	ns = sc->ns[nsid - 1];
	if (data->nsze == 0) {
		/* XXX: Needs locking */
		if (ns != NULL) {
			nvmf_destroy_ns(ns);
			sc->ns[nsid - 1] = NULL;
		}
	} else {
		/* XXX: Needs locking */
		if (ns == NULL) {
			sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
		} else {
			if (!nvmf_update_ns(ns, data)) {
				nvmf_destroy_ns(ns);
				sc->ns[nsid - 1] = NULL;
			}
		}
	}

	nvmf_sim_rescan_ns(sc, nsid);
}

void
nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid)
{
	struct nvmf_completion_status status;
	struct nvme_namespace_data *data;

	data = malloc(sizeof(*data), M_NVMF, M_WAITOK);

	nvmf_status_init(&status);
	nvmf_status_wait_io(&status);
	if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
	    &status, nvmf_io_complete, &status, M_WAITOK)) {
		device_printf(sc->dev,
		    "failed to send IDENTIFY namespace %u command\n", nsid);
		free(data, M_NVMF);
		return;
	}
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev,
		    "IDENTIFY namespace %u failed, status %#x\n", nsid,
		    le16toh(status.cqe.status));
		free(data, M_NVMF);
		return;
	}

	if (status.io_error != 0) {
		device_printf(sc->dev,
		    "IDENTIFY namespace %u failed with I/O error %d\n",
		    nsid, status.io_error);
		free(data, M_NVMF);
		return;
	}

	nvme_namespace_data_swapbytes(data);

	nvmf_rescan_ns_1(sc, nsid, data);

	free(data, M_NVMF);
}

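/*
 * Destroy any namespaces in the range [first_nsid, next_valid_nsid)
 * that are no longer present in the active namespace list.
 */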
static void
nvmf_purge_namespaces(struct nvmf_softc *sc, uint32_t first_nsid,
    uint32_t next_valid_nsid)
{
	struct nvmf_namespace *ns;

	for (uint32_t nsid = first_nsid; nsid < next_valid_nsid; nsid++)
	{
		/* XXX: Needs locking around sc->ns[]. */
		ns = sc->ns[nsid - 1];
		if (ns != NULL) {
			nvmf_destroy_ns(ns);
			sc->ns[nsid - 1] = NULL;

			nvmf_sim_rescan_ns(sc, nsid);
		}
	}
}

static bool
nvmf_rescan_ns_cb(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data, void *arg)
{
	uint32_t *last_nsid = arg;

	/* Check for any gaps prior to this namespace. */
	nvmf_purge_namespaces(sc, *last_nsid + 1, nsid);
	*last_nsid = nsid;

	nvmf_rescan_ns_1(sc, nsid, data);
	return (true);
}

void
nvmf_rescan_all_ns(struct nvmf_softc *sc)
{
	uint32_t last_nsid;

	last_nsid = 0;
	if (!nvmf_scan_active_namespaces(sc, nvmf_rescan_ns_cb, &last_nsid))
		return;

	/*
	 * Check for any namespace devices after the last active
	 * namespace.
	 */
	nvmf_purge_namespaces(sc, last_nsid + 1, sc->cdata->nn + 1);
}

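/*
 * Execute a passthrough command from userland on either the admin
 * queue or an I/O queue, copying data to and from the user buffer as
 * needed.
 */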
int
nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt,
    bool admin)
{
	struct nvmf_completion_status status;
	struct nvme_command cmd;
	struct memdesc mem;
	struct nvmf_host_qpair *qp;
	struct nvmf_request *req;
	void *buf;
	int error;

	if (pt->len > sc->max_xfer_size)
		return (EINVAL);

	buf = NULL;
	if (pt->len != 0) {
		/*
		 * XXX: Depending on the size we may want to pin the
		 * user pages and use a memdesc with vm_page_t's
		 * instead.
		 */
		buf = malloc(pt->len, M_NVMF, M_WAITOK);
		if (pt->is_read == 0) {
			error = copyin(pt->buf, buf, pt->len);
			if (error != 0) {
				free(buf, M_NVMF);
				return (error);
			}
		} else {
			/* Ensure no kernel data is leaked to userland. */
			memset(buf, 0, pt->len);
		}
	}

	memset(&cmd, 0, sizeof(cmd));
	cmd.opc = pt->cmd.opc;
	cmd.fuse = pt->cmd.fuse;
	cmd.nsid = pt->cmd.nsid;
	cmd.cdw10 = pt->cmd.cdw10;
	cmd.cdw11 = pt->cmd.cdw11;
	cmd.cdw12 = pt->cmd.cdw12;
	cmd.cdw13 = pt->cmd.cdw13;
	cmd.cdw14 = pt->cmd.cdw14;
	cmd.cdw15 = pt->cmd.cdw15;

	sx_slock(&sc->connection_lock);
	if (sc->admin == NULL || sc->detaching) {
		device_printf(sc->dev,
		    "failed to send passthrough command\n");
		error = ECONNABORTED;
		sx_sunlock(&sc->connection_lock);
		goto error;
	}
	if (admin)
		qp = sc->admin;
	else
		qp = nvmf_select_io_queue(sc);
	nvmf_status_init(&status);
	req = nvmf_allocate_request(qp, &cmd, nvmf_complete, &status, M_WAITOK);
	sx_sunlock(&sc->connection_lock);
	if (req == NULL) {
		device_printf(sc->dev, "failed to send passthrough command\n");
		error = ECONNABORTED;
		goto error;
	}

	if (pt->len != 0) {
		mem = memdesc_vaddr(buf, pt->len);
		nvmf_capsule_append_data(req->nc, &mem, pt->len,
		    pt->is_read == 0, nvmf_io_complete, &status);
		nvmf_status_wait_io(&status);
	}

	nvmf_submit_request(req);
	nvmf_wait_for_reply(&status);

	memset(&pt->cpl, 0, sizeof(pt->cpl));
	pt->cpl.cdw0 = status.cqe.cdw0;
	pt->cpl.status = status.cqe.status;

	error = status.io_error;
	if (error == 0 && pt->len != 0 && pt->is_read != 0)
		error = copyout(buf, pt->buf, pt->len);
error:
	free(buf, M_NVMF);
	return (error);
}

static int
nvmf_reconnect_params(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
{
	nvlist_t *nvl;
	int error;

	nvl = nvlist_create(0);

	sx_slock(&sc->connection_lock);
	if ((sc->cdata->fcatt & 1) == 0)
		nvlist_add_number(nvl, "cntlid", NVMF_CNTLID_DYNAMIC);
	else
		nvlist_add_number(nvl, "cntlid", sc->cdata->ctrlr_id);
	nvlist_add_stringf(nvl, "subnqn", "%.256s", sc->cdata->subnqn);
	sx_sunlock(&sc->connection_lock);

	error = nvmf_pack_ioc_nvlist(nvl, nv);
	nvlist_destroy(nvl);
	return (error);
}

static int
nvmf_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
    struct thread *td)
{
	struct nvmf_softc *sc = cdev->si_drv1;
	struct nvme_get_nsid *gnsid;
	struct nvme_pt_command *pt;
	struct nvmf_ioc_nv *nv;

	switch (cmd) {
	case NVME_PASSTHROUGH_CMD:
		pt = (struct nvme_pt_command *)arg;
		return (nvmf_passthrough_cmd(sc, pt, true));
	case NVME_GET_NSID:
		gnsid = (struct nvme_get_nsid *)arg;
		strlcpy(gnsid->cdev, device_get_nameunit(sc->dev),
		    sizeof(gnsid->cdev));
		gnsid->nsid = 0;
		return (0);
	case NVME_GET_MAX_XFER_SIZE:
		*(uint64_t *)arg = sc->max_xfer_size;
		return (0);
	case NVMF_RECONNECT_PARAMS:
		nv = (struct nvmf_ioc_nv *)arg;
		return (nvmf_reconnect_params(sc, nv));
	case NVMF_RECONNECT_HOST:
		nv = (struct nvmf_ioc_nv *)arg;
		return (nvmf_reconnect_host(sc, nv));
	default:
		return (ENOTTY);
	}
}

static struct cdevsw nvmf_cdevsw = {
	.d_version = D_VERSION,
	.d_ioctl = nvmf_ioctl
};

static int
nvmf_modevent(module_t mod, int what, void *arg)
{
	switch (what) {
	case MOD_LOAD:
		return (nvmf_ctl_load());
	case MOD_QUIESCE:
		return (0);
	case MOD_UNLOAD:
		nvmf_ctl_unload();
		destroy_dev_drain(&nvmf_cdevsw);
		return (0);
	default:
		return (EOPNOTSUPP);
	}
}

static device_method_t nvmf_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,     nvmf_probe),
	DEVMETHOD(device_attach,    nvmf_attach),
	DEVMETHOD(device_detach,    nvmf_detach),
	DEVMETHOD_END
};

driver_t nvme_nvmf_driver = {
	"nvme",
	nvmf_methods,
	sizeof(struct nvmf_softc),
};

DRIVER_MODULE(nvme, root, nvme_nvmf_driver, nvmf_modevent, NULL);
MODULE_DEPEND(nvmf, nvmf_transport, 1, 1, 1);