xref: /freebsd/sys/dev/nvmf/host/nvmf.c (revision b64c5a0ace59af62eff52bfe110a521dc73c937b)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2023-2024 Chelsio Communications, Inc.
5  * Written by: John Baldwin <jhb@FreeBSD.org>
6  */
7 
8 #include <sys/param.h>
9 #include <sys/bus.h>
10 #include <sys/conf.h>
11 #include <sys/eventhandler.h>
12 #include <sys/lock.h>
13 #include <sys/kernel.h>
14 #include <sys/malloc.h>
15 #include <sys/memdesc.h>
16 #include <sys/module.h>
17 #include <sys/mutex.h>
18 #include <sys/reboot.h>
19 #include <sys/sx.h>
20 #include <sys/sysctl.h>
21 #include <sys/taskqueue.h>
22 #include <dev/nvme/nvme.h>
23 #include <dev/nvmf/nvmf.h>
24 #include <dev/nvmf/nvmf_transport.h>
25 #include <dev/nvmf/host/nvmf_var.h>
26 
27 static struct cdevsw nvmf_cdevsw;
28 
29 bool nvmf_fail_disconnect = false;
30 SYSCTL_BOOL(_kern_nvmf, OID_AUTO, fail_on_disconnection, CTLFLAG_RWTUN,
31     &nvmf_fail_disconnect, 0, "Fail I/O requests on connection failure");
32 
33 MALLOC_DEFINE(M_NVMF, "nvmf", "NVMe over Fabrics host");
34 
35 static void	nvmf_disconnect_task(void *arg, int pending);
36 static void	nvmf_shutdown_pre_sync(void *arg, int howto);
37 static void	nvmf_shutdown_post_sync(void *arg, int howto);
38 
39 void
40 nvmf_complete(void *arg, const struct nvme_completion *cqe)
41 {
42 	struct nvmf_completion_status *status = arg;
43 	struct mtx *mtx;
44 
45 	status->cqe = *cqe;
46 	mtx = mtx_pool_find(mtxpool_sleep, status);
47 	mtx_lock(mtx);
48 	status->done = true;
49 	mtx_unlock(mtx);
50 	wakeup(status);
51 }
52 
53 void
54 nvmf_io_complete(void *arg, size_t xfered, int error)
55 {
56 	struct nvmf_completion_status *status = arg;
57 	struct mtx *mtx;
58 
59 	status->io_error = error;
60 	mtx = mtx_pool_find(mtxpool_sleep, status);
61 	mtx_lock(mtx);
62 	status->io_done = true;
63 	mtx_unlock(mtx);
64 	wakeup(status);
65 }
66 
67 void
68 nvmf_wait_for_reply(struct nvmf_completion_status *status)
69 {
70 	struct mtx *mtx;
71 
72 	mtx = mtx_pool_find(mtxpool_sleep, status);
73 	mtx_lock(mtx);
74 	while (!status->done || !status->io_done)
75 		mtx_sleep(status, mtx, 0, "nvmfcmd", 0);
76 	mtx_unlock(mtx);
77 }
78 
79 static int
80 nvmf_read_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
81     uint64_t *value)
82 {
83 	const struct nvmf_fabric_prop_get_rsp *rsp;
84 	struct nvmf_completion_status status;
85 
86 	nvmf_status_init(&status);
87 	if (!nvmf_cmd_get_property(sc, offset, size, nvmf_complete, &status,
88 	    M_WAITOK))
89 		return (ECONNABORTED);
90 	nvmf_wait_for_reply(&status);
91 
92 	if (status.cqe.status != 0) {
93 		device_printf(sc->dev, "PROPERTY_GET failed, status %#x\n",
94 		    le16toh(status.cqe.status));
95 		return (EIO);
96 	}
97 
98 	rsp = (const struct nvmf_fabric_prop_get_rsp *)&status.cqe;
99 	if (size == 8)
100 		*value = le64toh(rsp->value.u64);
101 	else
102 		*value = le32toh(rsp->value.u32.low);
103 	return (0);
104 }
105 
106 static int
107 nvmf_write_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
108     uint64_t value)
109 {
110 	struct nvmf_completion_status status;
111 
112 	nvmf_status_init(&status);
113 	if (!nvmf_cmd_set_property(sc, offset, size, value, nvmf_complete, &status,
114 	    M_WAITOK))
115 		return (ECONNABORTED);
116 	nvmf_wait_for_reply(&status);
117 
118 	if (status.cqe.status != 0) {
119 		device_printf(sc->dev, "PROPERTY_SET failed, status %#x\n",
120 		    le16toh(status.cqe.status));
121 		return (EIO);
122 	}
123 	return (0);
124 }
125 
126 static void
127 nvmf_shutdown_controller(struct nvmf_softc *sc)
128 {
129 	uint64_t cc;
130 	int error;
131 
132 	error = nvmf_read_property(sc, NVMF_PROP_CC, 4, &cc);
133 	if (error != 0) {
134 		device_printf(sc->dev, "Failed to fetch CC for shutdown\n");
135 		return;
136 	}
137 
138 	cc |= NVMEF(NVME_CC_REG_SHN, NVME_SHN_NORMAL);
139 
140 	error = nvmf_write_property(sc, NVMF_PROP_CC, 4, cc);
141 	if (error != 0)
142 		device_printf(sc->dev,
143 		    "Failed to set CC to trigger shutdown\n");
144 }
145 
146 static void
147 nvmf_check_keep_alive(void *arg)
148 {
149 	struct nvmf_softc *sc = arg;
150 	int traffic;
151 
152 	traffic = atomic_readandclear_int(&sc->ka_active_rx_traffic);
153 	if (traffic == 0) {
154 		device_printf(sc->dev,
155 		    "disconnecting due to KeepAlive timeout\n");
156 		nvmf_disconnect(sc);
157 		return;
158 	}
159 
160 	callout_schedule_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0, C_HARDCLOCK);
161 }
162 
163 static void
164 nvmf_keep_alive_complete(void *arg, const struct nvme_completion *cqe)
165 {
166 	struct nvmf_softc *sc = arg;
167 
168 	atomic_store_int(&sc->ka_active_rx_traffic, 1);
169 	if (cqe->status != 0) {
170 		device_printf(sc->dev,
171 		    "KeepAlive response reported status %#x\n",
172 		    le16toh(cqe->status));
173 	}
174 }
175 
176 static void
177 nvmf_send_keep_alive(void *arg)
178 {
179 	struct nvmf_softc *sc = arg;
180 	int traffic;
181 
182 	/*
183 	 * Don't bother sending a KeepAlive command if TKAS is active
184 	 * and another command has been sent during the interval.
185 	 */
186 	traffic = atomic_load_int(&sc->ka_active_tx_traffic);
187 	if (traffic == 0 && !nvmf_cmd_keep_alive(sc, nvmf_keep_alive_complete,
188 	    sc, M_NOWAIT))
189 		device_printf(sc->dev,
190 		    "Failed to allocate KeepAlive command\n");
191 
192 	/* Clear ka_active_tx_traffic after sending the keep alive command. */
193 	atomic_store_int(&sc->ka_active_tx_traffic, 0);
194 
195 	callout_schedule_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, C_HARDCLOCK);
196 }
197 
198 int
199 nvmf_init_ivars(struct nvmf_ivars *ivars, struct nvmf_handoff_host *hh)
200 {
201 	size_t len;
202 	u_int i;
203 	int error;
204 
205 	memset(ivars, 0, sizeof(*ivars));
206 
207 	if (!hh->admin.admin || hh->num_io_queues < 1)
208 		return (EINVAL);
209 
210 	ivars->cdata = malloc(sizeof(*ivars->cdata), M_NVMF, M_WAITOK);
211 	error = copyin(hh->cdata, ivars->cdata, sizeof(*ivars->cdata));
212 	if (error != 0)
213 		goto out;
214 	nvme_controller_data_swapbytes(ivars->cdata);
215 
216 	len = hh->num_io_queues * sizeof(*ivars->io_params);
217 	ivars->io_params = malloc(len, M_NVMF, M_WAITOK);
218 	error = copyin(hh->io, ivars->io_params, len);
219 	if (error != 0)
220 		goto out;
221 	for (i = 0; i < hh->num_io_queues; i++) {
222 		if (ivars->io_params[i].admin) {
223 			error = EINVAL;
224 			goto out;
225 		}
226 
227 		/* Require all I/O queues to be the same size. */
228 		if (ivars->io_params[i].qsize != ivars->io_params[0].qsize) {
229 			error = EINVAL;
230 			goto out;
231 		}
232 	}
233 
234 	ivars->hh = hh;
235 	return (0);
236 
237 out:
238 	free(ivars->io_params, M_NVMF);
239 	free(ivars->cdata, M_NVMF);
240 	return (error);
241 }
242 
243 void
244 nvmf_free_ivars(struct nvmf_ivars *ivars)
245 {
246 	free(ivars->io_params, M_NVMF);
247 	free(ivars->cdata, M_NVMF);
248 }
249 
250 static int
251 nvmf_probe(device_t dev)
252 {
253 	struct nvmf_ivars *ivars = device_get_ivars(dev);
254 
255 	if (ivars == NULL)
256 		return (ENXIO);
257 
258 	device_set_descf(dev, "Fabrics: %.256s", ivars->cdata->subnqn);
259 	return (BUS_PROBE_DEFAULT);
260 }
261 
262 static int
263 nvmf_establish_connection(struct nvmf_softc *sc, struct nvmf_ivars *ivars)
264 {
265 	char name[16];
266 
267 	/* Setup the admin queue. */
268 	sc->admin = nvmf_init_qp(sc, ivars->hh->trtype, &ivars->hh->admin,
269 	    "admin queue", 0);
270 	if (sc->admin == NULL) {
271 		device_printf(sc->dev, "Failed to setup admin queue\n");
272 		return (ENXIO);
273 	}
274 
275 	/* Setup I/O queues. */
276 	sc->io = malloc(ivars->hh->num_io_queues * sizeof(*sc->io), M_NVMF,
277 	    M_WAITOK | M_ZERO);
278 	sc->num_io_queues = ivars->hh->num_io_queues;
279 	for (u_int i = 0; i < sc->num_io_queues; i++) {
280 		snprintf(name, sizeof(name), "I/O queue %u", i);
281 		sc->io[i] = nvmf_init_qp(sc, ivars->hh->trtype,
282 		    &ivars->io_params[i], name, i);
283 		if (sc->io[i] == NULL) {
284 			device_printf(sc->dev, "Failed to setup I/O queue %u\n",
285 			    i + 1);
286 			return (ENXIO);
287 		}
288 	}
289 
290 	/* Start KeepAlive timers. */
291 	if (ivars->hh->kato != 0) {
292 		sc->ka_traffic = NVMEV(NVME_CTRLR_DATA_CTRATT_TBKAS,
293 		    sc->cdata->ctratt) != 0;
294 		sc->ka_rx_sbt = mstosbt(ivars->hh->kato);
295 		sc->ka_tx_sbt = sc->ka_rx_sbt / 2;
296 		callout_reset_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0,
297 		    nvmf_check_keep_alive, sc, C_HARDCLOCK);
298 		callout_reset_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0,
299 		    nvmf_send_keep_alive, sc, C_HARDCLOCK);
300 	}
301 
302 	return (0);
303 }
304 
305 typedef bool nvmf_scan_active_ns_cb(struct nvmf_softc *, uint32_t,
306     const struct nvme_namespace_data *, void *);
307 
308 static bool
309 nvmf_scan_active_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist,
310     struct nvme_namespace_data *data, uint32_t *nsidp,
311     nvmf_scan_active_ns_cb *cb, void *cb_arg)
312 {
313 	struct nvmf_completion_status status;
314 	uint32_t nsid;
315 
316 	nvmf_status_init(&status);
317 	nvmf_status_wait_io(&status);
318 	if (!nvmf_cmd_identify_active_namespaces(sc, *nsidp, nslist,
319 	    nvmf_complete, &status, nvmf_io_complete, &status, M_WAITOK)) {
320 		device_printf(sc->dev,
321 		    "failed to send IDENTIFY active namespaces command\n");
322 		return (false);
323 	}
324 	nvmf_wait_for_reply(&status);
325 
326 	if (status.cqe.status != 0) {
327 		device_printf(sc->dev,
328 		    "IDENTIFY active namespaces failed, status %#x\n",
329 		    le16toh(status.cqe.status));
330 		return (false);
331 	}
332 
333 	if (status.io_error != 0) {
334 		device_printf(sc->dev,
335 		    "IDENTIFY active namespaces failed with I/O error %d\n",
336 		    status.io_error);
337 		return (false);
338 	}
339 
340 	for (u_int i = 0; i < nitems(nslist->ns); i++) {
341 		nsid = nslist->ns[i];
342 		if (nsid == 0) {
343 			*nsidp = 0;
344 			return (true);
345 		}
346 
347 		nvmf_status_init(&status);
348 		nvmf_status_wait_io(&status);
349 		if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
350 		    &status, nvmf_io_complete, &status, M_WAITOK)) {
351 			device_printf(sc->dev,
352 			    "failed to send IDENTIFY namespace %u command\n",
353 			    nsid);
354 			return (false);
355 		}
356 		nvmf_wait_for_reply(&status);
357 
358 		if (status.cqe.status != 0) {
359 			device_printf(sc->dev,
360 			    "IDENTIFY namespace %u failed, status %#x\n", nsid,
361 			    le16toh(status.cqe.status));
362 			return (false);
363 		}
364 
365 		if (status.io_error != 0) {
366 			device_printf(sc->dev,
367 			    "IDENTIFY namespace %u failed with I/O error %d\n",
368 			    nsid, status.io_error);
369 			return (false);
370 		}
371 
372 		nvme_namespace_data_swapbytes(data);
373 		if (!cb(sc, nsid, data, cb_arg))
374 			return (false);
375 	}
376 
377 	MPASS(nsid == nslist->ns[nitems(nslist->ns) - 1] && nsid != 0);
378 
379 	if (nsid >= NVME_GLOBAL_NAMESPACE_TAG - 1)
380 		*nsidp = 0;
381 	else
382 		*nsidp = nsid;
383 	return (true);
384 }
385 
386 static bool
387 nvmf_scan_active_namespaces(struct nvmf_softc *sc, nvmf_scan_active_ns_cb *cb,
388     void *cb_arg)
389 {
390 	struct nvme_namespace_data *data;
391 	struct nvme_ns_list *nslist;
392 	uint32_t nsid;
393 	bool retval;
394 
395 	nslist = malloc(sizeof(*nslist), M_NVMF, M_WAITOK);
396 	data = malloc(sizeof(*data), M_NVMF, M_WAITOK);
397 
398 	nsid = 0;
399 	retval = true;
400 	for (;;) {
401 		if (!nvmf_scan_active_nslist(sc, nslist, data, &nsid, cb,
402 		    cb_arg)) {
403 			retval = false;
404 			break;
405 		}
406 		if (nsid == 0)
407 			break;
408 	}
409 
410 	free(data, M_NVMF);
411 	free(nslist, M_NVMF);
412 	return (retval);
413 }
414 
415 static bool
416 nvmf_add_ns(struct nvmf_softc *sc, uint32_t nsid,
417     const struct nvme_namespace_data *data, void *arg __unused)
418 {
419 	if (sc->ns[nsid - 1] != NULL) {
420 		device_printf(sc->dev,
421 		    "duplicate namespace %u in active namespace list\n",
422 		    nsid);
423 		return (false);
424 	}
425 
426 	/*
427 	 * As in nvme_ns_construct, a size of zero indicates an
428 	 * invalid namespace.
429 	 */
430 	if (data->nsze == 0) {
431 		device_printf(sc->dev,
432 		    "ignoring active namespace %u with zero size\n", nsid);
433 		return (true);
434 	}
435 
436 	sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
437 
438 	nvmf_sim_rescan_ns(sc, nsid);
439 	return (true);
440 }
441 
442 static bool
443 nvmf_add_namespaces(struct nvmf_softc *sc)
444 {
445 	sc->ns = mallocarray(sc->cdata->nn, sizeof(*sc->ns), M_NVMF,
446 	    M_WAITOK | M_ZERO);
447 	return (nvmf_scan_active_namespaces(sc, nvmf_add_ns, NULL));
448 }
449 
450 static int
451 nvmf_attach(device_t dev)
452 {
453 	struct make_dev_args mda;
454 	struct nvmf_softc *sc = device_get_softc(dev);
455 	struct nvmf_ivars *ivars = device_get_ivars(dev);
456 	struct sysctl_oid *oid;
457 	uint64_t val;
458 	u_int i;
459 	int error;
460 
461 	if (ivars == NULL)
462 		return (ENXIO);
463 
464 	sc->dev = dev;
465 	sc->trtype = ivars->hh->trtype;
466 	callout_init(&sc->ka_rx_timer, 1);
467 	callout_init(&sc->ka_tx_timer, 1);
468 	sx_init(&sc->connection_lock, "nvmf connection");
469 	TASK_INIT(&sc->disconnect_task, 0, nvmf_disconnect_task, sc);
470 
471 	oid = SYSCTL_ADD_NODE(device_get_sysctl_ctx(dev),
472 	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "ioq",
473 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "I/O Queues");
474 	sc->ioq_oid_list = SYSCTL_CHILDREN(oid);
475 
476 	/* Claim the cdata pointer from ivars. */
477 	sc->cdata = ivars->cdata;
478 	ivars->cdata = NULL;
479 
480 	nvmf_init_aer(sc);
481 
482 	error = nvmf_establish_connection(sc, ivars);
483 	if (error != 0)
484 		goto out;
485 
486 	error = nvmf_read_property(sc, NVMF_PROP_CAP, 8, &sc->cap);
487 	if (error != 0) {
488 		device_printf(sc->dev, "Failed to fetch CAP\n");
489 		error = ENXIO;
490 		goto out;
491 	}
492 
493 	error = nvmf_read_property(sc, NVMF_PROP_VS, 4, &val);
494 	if (error != 0) {
495 		device_printf(sc->dev, "Failed to fetch VS\n");
496 		error = ENXIO;
497 		goto out;
498 	}
499 	sc->vs = val;
500 
501 	/* Honor MDTS if it is set. */
502 	sc->max_xfer_size = maxphys;
503 	if (sc->cdata->mdts != 0) {
504 		sc->max_xfer_size = ulmin(sc->max_xfer_size,
505 		    1 << (sc->cdata->mdts + NVME_MPS_SHIFT +
506 		    NVME_CAP_HI_MPSMIN(sc->cap >> 32)));
507 	}
508 
509 	sc->max_pending_io = ivars->io_params[0].qsize * sc->num_io_queues;
510 
511 	error = nvmf_init_sim(sc);
512 	if (error != 0)
513 		goto out;
514 
515 	error = nvmf_start_aer(sc);
516 	if (error != 0) {
517 		nvmf_destroy_sim(sc);
518 		goto out;
519 	}
520 
521 	if (!nvmf_add_namespaces(sc)) {
522 		nvmf_destroy_sim(sc);
523 		goto out;
524 	}
525 
526 	make_dev_args_init(&mda);
527 	mda.mda_devsw = &nvmf_cdevsw;
528 	mda.mda_uid = UID_ROOT;
529 	mda.mda_gid = GID_WHEEL;
530 	mda.mda_mode = 0600;
531 	mda.mda_si_drv1 = sc;
532 	error = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev));
533 	if (error != 0) {
534 		nvmf_destroy_sim(sc);
535 		goto out;
536 	}
537 
538 	sc->shutdown_pre_sync_eh = EVENTHANDLER_REGISTER(shutdown_pre_sync,
539 	    nvmf_shutdown_pre_sync, sc, SHUTDOWN_PRI_FIRST);
540 	sc->shutdown_post_sync_eh = EVENTHANDLER_REGISTER(shutdown_post_sync,
541 	    nvmf_shutdown_post_sync, sc, SHUTDOWN_PRI_FIRST);
542 
543 	return (0);
544 out:
545 	if (sc->ns != NULL) {
546 		for (i = 0; i < sc->cdata->nn; i++) {
547 			if (sc->ns[i] != NULL)
548 				nvmf_destroy_ns(sc->ns[i]);
549 		}
550 		free(sc->ns, M_NVMF);
551 	}
552 
553 	callout_drain(&sc->ka_tx_timer);
554 	callout_drain(&sc->ka_rx_timer);
555 
556 	if (sc->admin != NULL)
557 		nvmf_shutdown_controller(sc);
558 
559 	for (i = 0; i < sc->num_io_queues; i++) {
560 		if (sc->io[i] != NULL)
561 			nvmf_destroy_qp(sc->io[i]);
562 	}
563 	free(sc->io, M_NVMF);
564 	if (sc->admin != NULL)
565 		nvmf_destroy_qp(sc->admin);
566 
567 	nvmf_destroy_aer(sc);
568 
569 	taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
570 	sx_destroy(&sc->connection_lock);
571 	free(sc->cdata, M_NVMF);
572 	return (error);
573 }
574 
575 void
576 nvmf_disconnect(struct nvmf_softc *sc)
577 {
578 	taskqueue_enqueue(taskqueue_thread, &sc->disconnect_task);
579 }
580 
581 static void
582 nvmf_disconnect_task(void *arg, int pending __unused)
583 {
584 	struct nvmf_softc *sc = arg;
585 	u_int i;
586 
587 	sx_xlock(&sc->connection_lock);
588 	if (sc->admin == NULL) {
589 		/*
590 		 * Ignore transport errors if there is no active
591 		 * association.
592 		 */
593 		sx_xunlock(&sc->connection_lock);
594 		return;
595 	}
596 
597 	if (sc->detaching) {
598 		if (sc->admin != NULL) {
599 			/*
600 			 * This unsticks the detach process if a
601 			 * transport error occurs during detach.
602 			 */
603 			nvmf_shutdown_qp(sc->admin);
604 		}
605 		sx_xunlock(&sc->connection_lock);
606 		return;
607 	}
608 
609 	if (sc->cdev == NULL) {
610 		/*
611 		 * Transport error occurred during attach (nvmf_add_namespaces).
612 		 * Shutdown the admin queue.
613 		 */
614 		nvmf_shutdown_qp(sc->admin);
615 		sx_xunlock(&sc->connection_lock);
616 		return;
617 	}
618 
619 	callout_drain(&sc->ka_tx_timer);
620 	callout_drain(&sc->ka_rx_timer);
621 	sc->ka_traffic = false;
622 
623 	/* Quiesce namespace consumers. */
624 	nvmf_disconnect_sim(sc);
625 	for (i = 0; i < sc->cdata->nn; i++) {
626 		if (sc->ns[i] != NULL)
627 			nvmf_disconnect_ns(sc->ns[i]);
628 	}
629 
630 	/* Shutdown the existing qpairs. */
631 	for (i = 0; i < sc->num_io_queues; i++) {
632 		nvmf_destroy_qp(sc->io[i]);
633 	}
634 	free(sc->io, M_NVMF);
635 	sc->io = NULL;
636 	sc->num_io_queues = 0;
637 	nvmf_destroy_qp(sc->admin);
638 	sc->admin = NULL;
639 
640 	sx_xunlock(&sc->connection_lock);
641 }
642 
643 static int
644 nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_handoff_host *hh)
645 {
646 	struct nvmf_ivars ivars;
647 	u_int i;
648 	int error;
649 
650 	/* XXX: Should we permit changing the transport type? */
651 	if (sc->trtype != hh->trtype) {
652 		device_printf(sc->dev,
653 		    "transport type mismatch on reconnect\n");
654 		return (EINVAL);
655 	}
656 
657 	error = nvmf_init_ivars(&ivars, hh);
658 	if (error != 0)
659 		return (error);
660 
661 	sx_xlock(&sc->connection_lock);
662 	if (sc->admin != NULL || sc->detaching) {
663 		error = EBUSY;
664 		goto out;
665 	}
666 
667 	/*
668 	 * Ensure this is for the same controller.  Note that the
669 	 * controller ID can vary across associations if the remote
670 	 * system is using the dynamic controller model.  This merely
671 	 * ensures the new association is connected to the same NVMe
672 	 * subsystem.
673 	 */
674 	if (memcmp(sc->cdata->subnqn, ivars.cdata->subnqn,
675 	    sizeof(ivars.cdata->subnqn)) != 0) {
676 		device_printf(sc->dev,
677 		    "controller subsystem NQN mismatch on reconnect\n");
678 		error = EINVAL;
679 		goto out;
680 	}
681 
682 	/*
683 	 * XXX: Require same number and size of I/O queues so that
684 	 * max_pending_io is still correct?
685 	 */
686 
687 	error = nvmf_establish_connection(sc, &ivars);
688 	if (error != 0)
689 		goto out;
690 
691 	error = nvmf_start_aer(sc);
692 	if (error != 0)
693 		goto out;
694 
695 	device_printf(sc->dev,
696 	    "established new association with %u I/O queues\n",
697 	    sc->num_io_queues);
698 
699 	/* Restart namespace consumers. */
700 	for (i = 0; i < sc->cdata->nn; i++) {
701 		if (sc->ns[i] != NULL)
702 			nvmf_reconnect_ns(sc->ns[i]);
703 	}
704 	nvmf_reconnect_sim(sc);
705 
706 	nvmf_rescan_all_ns(sc);
707 out:
708 	sx_xunlock(&sc->connection_lock);
709 	nvmf_free_ivars(&ivars);
710 	return (error);
711 }
712 
713 static void
714 nvmf_shutdown_pre_sync(void *arg, int howto)
715 {
716 	struct nvmf_softc *sc = arg;
717 
718 	if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED())
719 		return;
720 
721 	/*
722 	 * If this association is disconnected, abort any pending
723 	 * requests with an error to permit filesystems to unmount
724 	 * without hanging.
725 	 */
726 	sx_xlock(&sc->connection_lock);
727 	if (sc->admin != NULL || sc->detaching) {
728 		sx_xunlock(&sc->connection_lock);
729 		return;
730 	}
731 
732 	for (u_int i = 0; i < sc->cdata->nn; i++) {
733 		if (sc->ns[i] != NULL)
734 			nvmf_shutdown_ns(sc->ns[i]);
735 	}
736 	nvmf_shutdown_sim(sc);
737 	sx_xunlock(&sc->connection_lock);
738 }
739 
740 static void
741 nvmf_shutdown_post_sync(void *arg, int howto)
742 {
743 	struct nvmf_softc *sc = arg;
744 
745 	if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED())
746 		return;
747 
748 	/*
749 	 * If this association is connected, disconnect gracefully.
750 	 */
751 	sx_xlock(&sc->connection_lock);
752 	if (sc->admin == NULL || sc->detaching) {
753 		sx_xunlock(&sc->connection_lock);
754 		return;
755 	}
756 
757 	callout_drain(&sc->ka_tx_timer);
758 	callout_drain(&sc->ka_rx_timer);
759 
760 	nvmf_shutdown_controller(sc);
761 	for (u_int i = 0; i < sc->num_io_queues; i++) {
762 		nvmf_destroy_qp(sc->io[i]);
763 	}
764 	nvmf_destroy_qp(sc->admin);
765 	sc->admin = NULL;
766 	sx_xunlock(&sc->connection_lock);
767 }
768 
769 static int
770 nvmf_detach(device_t dev)
771 {
772 	struct nvmf_softc *sc = device_get_softc(dev);
773 	u_int i;
774 
775 	destroy_dev(sc->cdev);
776 
777 	sx_xlock(&sc->connection_lock);
778 	sc->detaching = true;
779 	sx_xunlock(&sc->connection_lock);
780 
781 	EVENTHANDLER_DEREGISTER(shutdown_pre_sync, sc->shutdown_pre_sync_eh);
782 	EVENTHANDLER_DEREGISTER(shutdown_post_sync, sc->shutdown_post_sync_eh);
783 
784 	nvmf_destroy_sim(sc);
785 	for (i = 0; i < sc->cdata->nn; i++) {
786 		if (sc->ns[i] != NULL)
787 			nvmf_destroy_ns(sc->ns[i]);
788 	}
789 	free(sc->ns, M_NVMF);
790 
791 	callout_drain(&sc->ka_tx_timer);
792 	callout_drain(&sc->ka_rx_timer);
793 
794 	if (sc->admin != NULL)
795 		nvmf_shutdown_controller(sc);
796 
797 	for (i = 0; i < sc->num_io_queues; i++) {
798 		nvmf_destroy_qp(sc->io[i]);
799 	}
800 	free(sc->io, M_NVMF);
801 
802 	taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
803 
804 	if (sc->admin != NULL)
805 		nvmf_destroy_qp(sc->admin);
806 
807 	nvmf_destroy_aer(sc);
808 
809 	sx_destroy(&sc->connection_lock);
810 	free(sc->cdata, M_NVMF);
811 	return (0);
812 }
813 
814 static void
815 nvmf_rescan_ns_1(struct nvmf_softc *sc, uint32_t nsid,
816     const struct nvme_namespace_data *data)
817 {
818 	struct nvmf_namespace *ns;
819 
820 	/* XXX: Needs locking around sc->ns[]. */
821 	ns = sc->ns[nsid - 1];
822 	if (data->nsze == 0) {
823 		/* XXX: Needs locking */
824 		if (ns != NULL) {
825 			nvmf_destroy_ns(ns);
826 			sc->ns[nsid - 1] = NULL;
827 		}
828 	} else {
829 		/* XXX: Needs locking */
830 		if (ns == NULL) {
831 			sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
832 		} else {
833 			if (!nvmf_update_ns(ns, data)) {
834 				nvmf_destroy_ns(ns);
835 				sc->ns[nsid - 1] = NULL;
836 			}
837 		}
838 	}
839 
840 	nvmf_sim_rescan_ns(sc, nsid);
841 }
842 
843 void
844 nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid)
845 {
846 	struct nvmf_completion_status status;
847 	struct nvme_namespace_data *data;
848 
849 	data = malloc(sizeof(*data), M_NVMF, M_WAITOK);
850 
851 	nvmf_status_init(&status);
852 	nvmf_status_wait_io(&status);
853 	if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
854 	    &status, nvmf_io_complete, &status, M_WAITOK)) {
855 		device_printf(sc->dev,
856 		    "failed to send IDENTIFY namespace %u command\n", nsid);
857 		free(data, M_NVMF);
858 		return;
859 	}
860 	nvmf_wait_for_reply(&status);
861 
862 	if (status.cqe.status != 0) {
863 		device_printf(sc->dev,
864 		    "IDENTIFY namespace %u failed, status %#x\n", nsid,
865 		    le16toh(status.cqe.status));
866 		free(data, M_NVMF);
867 		return;
868 	}
869 
870 	if (status.io_error != 0) {
871 		device_printf(sc->dev,
872 		    "IDENTIFY namespace %u failed with I/O error %d\n",
873 		    nsid, status.io_error);
874 		free(data, M_NVMF);
875 		return;
876 	}
877 
878 	nvme_namespace_data_swapbytes(data);
879 
880 	nvmf_rescan_ns_1(sc, nsid, data);
881 
882 	free(data, M_NVMF);
883 }
884 
885 static void
886 nvmf_purge_namespaces(struct nvmf_softc *sc, uint32_t first_nsid,
887     uint32_t next_valid_nsid)
888 {
889 	struct nvmf_namespace *ns;
890 
891 	for (uint32_t nsid = first_nsid; nsid < next_valid_nsid; nsid++)
892 	{
893 		/* XXX: Needs locking around sc->ns[]. */
894 		ns = sc->ns[nsid - 1];
895 		if (ns != NULL) {
896 			nvmf_destroy_ns(ns);
897 			sc->ns[nsid - 1] = NULL;
898 
899 			nvmf_sim_rescan_ns(sc, nsid);
900 		}
901 	}
902 }
903 
904 static bool
905 nvmf_rescan_ns_cb(struct nvmf_softc *sc, uint32_t nsid,
906     const struct nvme_namespace_data *data, void *arg)
907 {
908 	uint32_t *last_nsid = arg;
909 
910 	/* Check for any gaps prior to this namespace. */
911 	nvmf_purge_namespaces(sc, *last_nsid + 1, nsid);
912 	*last_nsid = nsid;
913 
914 	nvmf_rescan_ns_1(sc, nsid, data);
915 	return (true);
916 }
917 
918 void
919 nvmf_rescan_all_ns(struct nvmf_softc *sc)
920 {
921 	uint32_t last_nsid;
922 
923 	last_nsid = 0;
924 	if (!nvmf_scan_active_namespaces(sc, nvmf_rescan_ns_cb, &last_nsid))
925 		return;
926 
927 	/*
928 	 * Check for any namespace devices after the last active
929 	 * namespace.
930 	 */
931 	nvmf_purge_namespaces(sc, last_nsid + 1, sc->cdata->nn + 1);
932 }
933 
934 int
935 nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt,
936     bool admin)
937 {
938 	struct nvmf_completion_status status;
939 	struct nvme_command cmd;
940 	struct memdesc mem;
941 	struct nvmf_host_qpair *qp;
942 	struct nvmf_request *req;
943 	void *buf;
944 	int error;
945 
946 	if (pt->len > sc->max_xfer_size)
947 		return (EINVAL);
948 
949 	buf = NULL;
950 	if (pt->len != 0) {
951 		/*
952 		 * XXX: Depending on the size we may want to pin the
953 		 * user pages and use a memdesc with vm_page_t's
954 		 * instead.
955 		 */
956 		buf = malloc(pt->len, M_NVMF, M_WAITOK);
957 		if (pt->is_read == 0) {
958 			error = copyin(pt->buf, buf, pt->len);
959 			if (error != 0) {
960 				free(buf, M_NVMF);
961 				return (error);
962 			}
963 		} else {
964 			/* Ensure no kernel data is leaked to userland. */
965 			memset(buf, 0, pt->len);
966 		}
967 	}
968 
969 	memset(&cmd, 0, sizeof(cmd));
970 	cmd.opc = pt->cmd.opc;
971 	cmd.fuse = pt->cmd.fuse;
972 	cmd.nsid = pt->cmd.nsid;
973 	cmd.cdw10 = pt->cmd.cdw10;
974 	cmd.cdw11 = pt->cmd.cdw11;
975 	cmd.cdw12 = pt->cmd.cdw12;
976 	cmd.cdw13 = pt->cmd.cdw13;
977 	cmd.cdw14 = pt->cmd.cdw14;
978 	cmd.cdw15 = pt->cmd.cdw15;
979 
980 	sx_slock(&sc->connection_lock);
981 	if (sc->admin == NULL || sc->detaching) {
982 		device_printf(sc->dev,
983 		    "failed to send passthrough command\n");
984 		error = ECONNABORTED;
985 		sx_sunlock(&sc->connection_lock);
986 		goto error;
987 	}
988 	if (admin)
989 		qp = sc->admin;
990 	else
991 		qp = nvmf_select_io_queue(sc);
992 	nvmf_status_init(&status);
993 	req = nvmf_allocate_request(qp, &cmd, nvmf_complete, &status, M_WAITOK);
994 	sx_sunlock(&sc->connection_lock);
995 	if (req == NULL) {
996 		device_printf(sc->dev, "failed to send passthrough command\n");
997 		error = ECONNABORTED;
998 		goto error;
999 	}
1000 
1001 	if (pt->len != 0) {
1002 		mem = memdesc_vaddr(buf, pt->len);
1003 		nvmf_capsule_append_data(req->nc, &mem, pt->len,
1004 		    pt->is_read == 0, nvmf_io_complete, &status);
1005 		nvmf_status_wait_io(&status);
1006 	}
1007 
1008 	nvmf_submit_request(req);
1009 	nvmf_wait_for_reply(&status);
1010 
1011 	memset(&pt->cpl, 0, sizeof(pt->cpl));
1012 	pt->cpl.cdw0 = status.cqe.cdw0;
1013 	pt->cpl.status = status.cqe.status;
1014 
1015 	error = status.io_error;
1016 	if (error == 0 && pt->len != 0 && pt->is_read != 0)
1017 		error = copyout(buf, pt->buf, pt->len);
1018 error:
1019 	free(buf, M_NVMF);
1020 	return (error);
1021 }
1022 
1023 static int
1024 nvmf_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
1025     struct thread *td)
1026 {
1027 	struct nvmf_softc *sc = cdev->si_drv1;
1028 	struct nvme_get_nsid *gnsid;
1029 	struct nvme_pt_command *pt;
1030 	struct nvmf_reconnect_params *rp;
1031 	struct nvmf_handoff_host *hh;
1032 
1033 	switch (cmd) {
1034 	case NVME_PASSTHROUGH_CMD:
1035 		pt = (struct nvme_pt_command *)arg;
1036 		return (nvmf_passthrough_cmd(sc, pt, true));
1037 	case NVME_GET_NSID:
1038 		gnsid = (struct nvme_get_nsid *)arg;
1039 		strlcpy(gnsid->cdev, device_get_nameunit(sc->dev),
1040 		    sizeof(gnsid->cdev));
1041 		gnsid->nsid = 0;
1042 		return (0);
1043 	case NVME_GET_MAX_XFER_SIZE:
1044 		*(uint64_t *)arg = sc->max_xfer_size;
1045 		return (0);
1046 	case NVMF_RECONNECT_PARAMS:
1047 		rp = (struct nvmf_reconnect_params *)arg;
1048 		if ((sc->cdata->fcatt & 1) == 0)
1049 			rp->cntlid = NVMF_CNTLID_DYNAMIC;
1050 		else
1051 			rp->cntlid = sc->cdata->ctrlr_id;
1052 		memcpy(rp->subnqn, sc->cdata->subnqn, sizeof(rp->subnqn));
1053 		return (0);
1054 	case NVMF_RECONNECT_HOST:
1055 		hh = (struct nvmf_handoff_host *)arg;
1056 		return (nvmf_reconnect_host(sc, hh));
1057 	default:
1058 		return (ENOTTY);
1059 	}
1060 }
1061 
1062 static struct cdevsw nvmf_cdevsw = {
1063 	.d_version = D_VERSION,
1064 	.d_ioctl = nvmf_ioctl
1065 };
1066 
1067 static int
1068 nvmf_modevent(module_t mod, int what, void *arg)
1069 {
1070 	switch (what) {
1071 	case MOD_LOAD:
1072 		return (nvmf_ctl_load());
1073 	case MOD_QUIESCE:
1074 		return (0);
1075 	case MOD_UNLOAD:
1076 		nvmf_ctl_unload();
1077 		destroy_dev_drain(&nvmf_cdevsw);
1078 		return (0);
1079 	default:
1080 		return (EOPNOTSUPP);
1081 	}
1082 }
1083 
1084 static device_method_t nvmf_methods[] = {
1085 	/* Device interface */
1086 	DEVMETHOD(device_probe,     nvmf_probe),
1087 	DEVMETHOD(device_attach,    nvmf_attach),
1088 	DEVMETHOD(device_detach,    nvmf_detach),
1089 	DEVMETHOD_END
1090 };
1091 
1092 driver_t nvme_nvmf_driver = {
1093 	"nvme",
1094 	nvmf_methods,
1095 	sizeof(struct nvmf_softc),
1096 };
1097 
1098 DRIVER_MODULE(nvme, root, nvme_nvmf_driver, nvmf_modevent, NULL);
1099 MODULE_DEPEND(nvmf, nvmf_transport, 1, 1, 1);
1100