/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2023-2024 Chelsio Communications, Inc.
 * Written by: John Baldwin <jhb@FreeBSD.org>
 */

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/dnv.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/memdesc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/nv.h>
#include <sys/reboot.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/nvmf.h>
#include <dev/nvmf/nvmf_transport.h>
#include <dev/nvmf/host/nvmf_var.h>

static struct cdevsw nvmf_cdevsw;

bool nvmf_fail_disconnect = false;
SYSCTL_BOOL(_kern_nvmf, OID_AUTO, fail_on_disconnection, CTLFLAG_RWTUN,
    &nvmf_fail_disconnect, 0, "Fail I/O requests on connection failure");

MALLOC_DEFINE(M_NVMF, "nvmf", "NVMe over Fabrics host");

static void	nvmf_disconnect_task(void *arg, int pending);
static void	nvmf_shutdown_pre_sync(void *arg, int howto);
static void	nvmf_shutdown_post_sync(void *arg, int howto);

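/*
 * Synchronous command helpers.  The completion callbacks below record
 * results in an nvmf_completion_status and wake up any thread sleeping
 * in nvmf_wait_for_reply().  A mutex from the sleep mutex pool, keyed
 * on the status structure, serializes the done flags with the sleep.
 */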
void
nvmf_complete(void *arg, const struct nvme_completion *cqe)
{
	struct nvmf_completion_status *status = arg;
	struct mtx *mtx;

	status->cqe = *cqe;
	mtx = mtx_pool_find(mtxpool_sleep, status);
	mtx_lock(mtx);
	status->done = true;
	mtx_unlock(mtx);
	wakeup(status);
}

void
nvmf_io_complete(void *arg, size_t xfered, int error)
{
	struct nvmf_completion_status *status = arg;
	struct mtx *mtx;

	status->io_error = error;
	mtx = mtx_pool_find(mtxpool_sleep, status);
	mtx_lock(mtx);
	status->io_done = true;
	mtx_unlock(mtx);
	wakeup(status);
}

void
nvmf_wait_for_reply(struct nvmf_completion_status *status)
{
	struct mtx *mtx;

	mtx = mtx_pool_find(mtxpool_sleep, status);
	mtx_lock(mtx);
	while (!status->done || !status->io_done)
		mtx_sleep(status, mtx, 0, "nvmfcmd", 0);
	mtx_unlock(mtx);
}

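/*
 * Synchronous wrappers around the Fabrics PROPERTY_GET and PROPERTY_SET
 * commands used to access controller properties such as CAP, VS, and CC.
 */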
static int
nvmf_read_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
    uint64_t *value)
{
	const struct nvmf_fabric_prop_get_rsp *rsp;
	struct nvmf_completion_status status;

	nvmf_status_init(&status);
	if (!nvmf_cmd_get_property(sc, offset, size, nvmf_complete, &status,
	    M_WAITOK))
		return (ECONNABORTED);
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev, "PROPERTY_GET failed, status %#x\n",
		    le16toh(status.cqe.status));
		return (EIO);
	}

	rsp = (const struct nvmf_fabric_prop_get_rsp *)&status.cqe;
	if (size == 8)
		*value = le64toh(rsp->value.u64);
	else
		*value = le32toh(rsp->value.u32.low);
	return (0);
}

static int
nvmf_write_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
    uint64_t value)
{
	struct nvmf_completion_status status;

	nvmf_status_init(&status);
	if (!nvmf_cmd_set_property(sc, offset, size, value, nvmf_complete, &status,
	    M_WAITOK))
		return (ECONNABORTED);
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev, "PROPERTY_SET failed, status %#x\n",
		    le16toh(status.cqe.status));
		return (EIO);
	}
	return (0);
}

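/*
 * Request a normal controller shutdown by setting CC.SHN.
 */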
static void
nvmf_shutdown_controller(struct nvmf_softc *sc)
{
	uint64_t cc;
	int error;

	error = nvmf_read_property(sc, NVMF_PROP_CC, 4, &cc);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch CC for shutdown\n");
		return;
	}

	cc |= NVMEF(NVME_CC_REG_SHN, NVME_SHN_NORMAL);

	error = nvmf_write_property(sc, NVMF_PROP_CC, 4, cc);
	if (error != 0)
		device_printf(sc->dev,
		    "Failed to set CC to trigger shutdown\n");
}

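/*
 * Keep Alive handling.  The RX timer tears down the association if no
 * traffic has been received from the controller during a Keep Alive
 * interval.  The TX timer periodically sends Keep Alive commands at
 * half that interval.
 */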
static void
nvmf_check_keep_alive(void *arg)
{
	struct nvmf_softc *sc = arg;
	int traffic;

	traffic = atomic_readandclear_int(&sc->ka_active_rx_traffic);
	if (traffic == 0) {
		device_printf(sc->dev,
		    "disconnecting due to KeepAlive timeout\n");
		nvmf_disconnect(sc);
		return;
	}

	callout_schedule_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0, C_HARDCLOCK);
}

static void
nvmf_keep_alive_complete(void *arg, const struct nvme_completion *cqe)
{
	struct nvmf_softc *sc = arg;

	atomic_store_int(&sc->ka_active_rx_traffic, 1);
	if (cqe->status != 0) {
		device_printf(sc->dev,
		    "KeepAlive response reported status %#x\n",
		    le16toh(cqe->status));
	}
}

static void
nvmf_send_keep_alive(void *arg)
{
	struct nvmf_softc *sc = arg;
	int traffic;

	/*
	 * Don't bother sending a KeepAlive command if TKAS is active
	 * and another command has been sent during the interval.
	 */
	traffic = atomic_load_int(&sc->ka_active_tx_traffic);
	if (traffic == 0 && !nvmf_cmd_keep_alive(sc, nvmf_keep_alive_complete,
	    sc, M_NOWAIT))
		device_printf(sc->dev,
		    "Failed to allocate KeepAlive command\n");

	/* Clear ka_active_tx_traffic after sending the keep alive command. */
	atomic_store_int(&sc->ka_active_tx_traffic, 0);

	callout_schedule_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, C_HARDCLOCK);
}

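/*
 * Copy in the handoff nvlist provided by userland and validate that it
 * describes a complete association: an admin queue, at least one I/O
 * queue of uniform size, controller identify data, and reconnect
 * parameters naming the same subsystem NQN.
 */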
int
nvmf_copyin_handoff(const struct nvmf_ioc_nv *nv, nvlist_t **nvlp)
{
	const struct nvme_discovery_log_entry *dle;
	const struct nvme_controller_data *cdata;
	const nvlist_t *const *io;
	const nvlist_t *admin, *rparams;
	nvlist_t *nvl;
	size_t i, num_io_queues;
	uint32_t qsize;
	int error;

	error = nvmf_unpack_ioc_nvlist(nv, &nvl);
	if (error != 0)
		return (error);

	if (!nvlist_exists_number(nvl, "trtype") ||
	    !nvlist_exists_nvlist(nvl, "admin") ||
	    !nvlist_exists_nvlist_array(nvl, "io") ||
	    !nvlist_exists_binary(nvl, "cdata") ||
	    !nvlist_exists_nvlist(nvl, "rparams"))
		goto invalid;

	rparams = nvlist_get_nvlist(nvl, "rparams");
	if (!nvlist_exists_binary(rparams, "dle") ||
	    !nvlist_exists_string(rparams, "hostnqn") ||
	    !nvlist_exists_number(rparams, "num_io_queues") ||
	    !nvlist_exists_number(rparams, "io_qsize"))
		goto invalid;

	admin = nvlist_get_nvlist(nvl, "admin");
	if (!nvmf_validate_qpair_nvlist(admin, false))
		goto invalid;
	if (!nvlist_get_bool(admin, "admin"))
		goto invalid;

	io = nvlist_get_nvlist_array(nvl, "io", &num_io_queues);
	if (num_io_queues < 1 ||
	    num_io_queues != nvlist_get_number(rparams, "num_io_queues"))
		goto invalid;
	for (i = 0; i < num_io_queues; i++) {
		if (!nvmf_validate_qpair_nvlist(io[i], false))
			goto invalid;
	}

	/* Require all I/O queues to be the same size. */
	qsize = nvlist_get_number(rparams, "io_qsize");
	for (i = 0; i < num_io_queues; i++) {
		if (nvlist_get_number(io[i], "qsize") != qsize)
			goto invalid;
	}

	cdata = nvlist_get_binary(nvl, "cdata", &i);
	if (i != sizeof(*cdata))
		goto invalid;
	dle = nvlist_get_binary(rparams, "dle", &i);
	if (i != sizeof(*dle))
		goto invalid;

	if (memcmp(dle->subnqn, cdata->subnqn, sizeof(cdata->subnqn)) != 0)
		goto invalid;

	*nvlp = nvl;
	return (0);
invalid:
	nvlist_destroy(nvl);
	return (EINVAL);
}

static int
nvmf_probe(device_t dev)
{
	const nvlist_t *nvl = device_get_ivars(dev);
	const struct nvme_controller_data *cdata;

	if (nvl == NULL)
		return (ENXIO);

	cdata = nvlist_get_binary(nvl, "cdata", NULL);
	device_set_descf(dev, "Fabrics: %.256s", cdata->subnqn);
	return (BUS_PROBE_DEFAULT);
}

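/*
 * Create the admin and I/O queue pairs for a new association, start the
 * Keep Alive timers if a KATO was provided, and save the reconnect
 * parameters for later use.
 */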
static int
nvmf_establish_connection(struct nvmf_softc *sc, nvlist_t *nvl)
{
	const nvlist_t *const *io;
	const nvlist_t *admin;
	uint64_t kato;
	size_t num_io_queues;
	enum nvmf_trtype trtype;
	char name[16];

	trtype = nvlist_get_number(nvl, "trtype");
	admin = nvlist_get_nvlist(nvl, "admin");
	io = nvlist_get_nvlist_array(nvl, "io", &num_io_queues);
	kato = dnvlist_get_number(nvl, "kato", 0);

	/* Setup the admin queue. */
	sc->admin = nvmf_init_qp(sc, trtype, admin, "admin queue", 0);
	if (sc->admin == NULL) {
		device_printf(sc->dev, "Failed to setup admin queue\n");
		return (ENXIO);
	}

	/* Setup I/O queues. */
	sc->io = malloc(num_io_queues * sizeof(*sc->io), M_NVMF,
	    M_WAITOK | M_ZERO);
	sc->num_io_queues = num_io_queues;
	for (u_int i = 0; i < sc->num_io_queues; i++) {
		snprintf(name, sizeof(name), "I/O queue %u", i);
		sc->io[i] = nvmf_init_qp(sc, trtype, io[i], name, i);
		if (sc->io[i] == NULL) {
			device_printf(sc->dev, "Failed to setup I/O queue %u\n",
			    i);
			return (ENXIO);
		}
	}

	/* Start KeepAlive timers. */
	if (kato != 0) {
		sc->ka_traffic = NVMEV(NVME_CTRLR_DATA_CTRATT_TBKAS,
		    sc->cdata->ctratt) != 0;
		sc->ka_rx_sbt = mstosbt(kato);
		sc->ka_tx_sbt = sc->ka_rx_sbt / 2;
		callout_reset_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0,
		    nvmf_check_keep_alive, sc, C_HARDCLOCK);
		callout_reset_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0,
		    nvmf_send_keep_alive, sc, C_HARDCLOCK);
	}

	memcpy(sc->cdata, nvlist_get_binary(nvl, "cdata", NULL),
	    sizeof(*sc->cdata));

	/* Save reconnect parameters. */
	nvlist_destroy(sc->rparams);
	sc->rparams = nvlist_take_nvlist(nvl, "rparams");

	return (0);
}

typedef bool nvmf_scan_active_ns_cb(struct nvmf_softc *, uint32_t,
    const struct nvme_namespace_data *, void *);

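/*
 * Fetch one page of the active namespace list starting after *nsidp,
 * identify each listed namespace, and invoke the callback for it.  On
 * return *nsidp holds the NSID to resume the scan from, or 0 once the
 * scan is complete.
 */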
static bool
nvmf_scan_active_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist,
    struct nvme_namespace_data *data, uint32_t *nsidp,
    nvmf_scan_active_ns_cb *cb, void *cb_arg)
{
	struct nvmf_completion_status status;
	uint32_t nsid;

	nvmf_status_init(&status);
	nvmf_status_wait_io(&status);
	if (!nvmf_cmd_identify_active_namespaces(sc, *nsidp, nslist,
	    nvmf_complete, &status, nvmf_io_complete, &status, M_WAITOK)) {
		device_printf(sc->dev,
		    "failed to send IDENTIFY active namespaces command\n");
		return (false);
	}
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev,
		    "IDENTIFY active namespaces failed, status %#x\n",
		    le16toh(status.cqe.status));
		return (false);
	}

	if (status.io_error != 0) {
		device_printf(sc->dev,
		    "IDENTIFY active namespaces failed with I/O error %d\n",
		    status.io_error);
		return (false);
	}

	for (u_int i = 0; i < nitems(nslist->ns); i++) {
		nsid = nslist->ns[i];
		if (nsid == 0) {
			*nsidp = 0;
			return (true);
		}

		nvmf_status_init(&status);
		nvmf_status_wait_io(&status);
		if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
		    &status, nvmf_io_complete, &status, M_WAITOK)) {
			device_printf(sc->dev,
			    "failed to send IDENTIFY namespace %u command\n",
			    nsid);
			return (false);
		}
		nvmf_wait_for_reply(&status);

		if (status.cqe.status != 0) {
			device_printf(sc->dev,
			    "IDENTIFY namespace %u failed, status %#x\n", nsid,
			    le16toh(status.cqe.status));
			return (false);
		}

		if (status.io_error != 0) {
			device_printf(sc->dev,
			    "IDENTIFY namespace %u failed with I/O error %d\n",
			    nsid, status.io_error);
			return (false);
		}

		nvme_namespace_data_swapbytes(data);
		if (!cb(sc, nsid, data, cb_arg))
			return (false);
	}

	MPASS(nsid == nslist->ns[nitems(nslist->ns) - 1] && nsid != 0);

	if (nsid >= NVME_GLOBAL_NAMESPACE_TAG - 1)
		*nsidp = 0;
	else
		*nsidp = nsid;
	return (true);
}

static bool
nvmf_scan_active_namespaces(struct nvmf_softc *sc, nvmf_scan_active_ns_cb *cb,
    void *cb_arg)
{
	struct nvme_namespace_data *data;
	struct nvme_ns_list *nslist;
	uint32_t nsid;
	bool retval;

	nslist = malloc(sizeof(*nslist), M_NVMF, M_WAITOK);
	data = malloc(sizeof(*data), M_NVMF, M_WAITOK);

	nsid = 0;
	retval = true;
	for (;;) {
		if (!nvmf_scan_active_nslist(sc, nslist, data, &nsid, cb,
		    cb_arg)) {
			retval = false;
			break;
		}
		if (nsid == 0)
			break;
	}

	free(data, M_NVMF);
	free(nslist, M_NVMF);
	return (retval);
}

static bool
nvmf_add_ns(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data, void *arg __unused)
{
	if (sc->ns[nsid - 1] != NULL) {
		device_printf(sc->dev,
		    "duplicate namespace %u in active namespace list\n",
		    nsid);
		return (false);
	}

	/*
	 * As in nvme_ns_construct, a size of zero indicates an
	 * invalid namespace.
	 */
	if (data->nsze == 0) {
		device_printf(sc->dev,
		    "ignoring active namespace %u with zero size\n", nsid);
		return (true);
	}

	sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);

	nvmf_sim_rescan_ns(sc, nsid);
	return (true);
}

static bool
nvmf_add_namespaces(struct nvmf_softc *sc)
{
	sc->ns = mallocarray(sc->cdata->nn, sizeof(*sc->ns), M_NVMF,
	    M_WAITOK | M_ZERO);
	return (nvmf_scan_active_namespaces(sc, nvmf_add_ns, NULL));
}

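/*
 * Attach: establish the association handed off from userland, read the
 * CAP and VS properties, compute the maximum transfer size from MDTS,
 * create the CAM SIM, start AER handling, enumerate active namespaces,
 * and create the controller's character device.
 */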
static int
nvmf_attach(device_t dev)
{
	struct make_dev_args mda;
	struct nvmf_softc *sc = device_get_softc(dev);
	nvlist_t *nvl = device_get_ivars(dev);
	const nvlist_t * const *io;
	struct sysctl_oid *oid;
	uint64_t val;
	u_int i;
	int error;

	if (nvl == NULL)
		return (ENXIO);

	sc->dev = dev;
	sc->trtype = nvlist_get_number(nvl, "trtype");
	callout_init(&sc->ka_rx_timer, 1);
	callout_init(&sc->ka_tx_timer, 1);
	sx_init(&sc->connection_lock, "nvmf connection");
	TASK_INIT(&sc->disconnect_task, 0, nvmf_disconnect_task, sc);

	oid = SYSCTL_ADD_NODE(device_get_sysctl_ctx(dev),
	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "ioq",
	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "I/O Queues");
	sc->ioq_oid_list = SYSCTL_CHILDREN(oid);

	sc->cdata = malloc(sizeof(*sc->cdata), M_NVMF, M_WAITOK);

	nvmf_init_aer(sc);

	error = nvmf_establish_connection(sc, nvl);
	if (error != 0)
		goto out;

	error = nvmf_read_property(sc, NVMF_PROP_CAP, 8, &sc->cap);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch CAP\n");
		error = ENXIO;
		goto out;
	}

	error = nvmf_read_property(sc, NVMF_PROP_VS, 4, &val);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch VS\n");
		error = ENXIO;
		goto out;
	}
	sc->vs = val;

	/* Honor MDTS if it is set. */
	sc->max_xfer_size = maxphys;
	if (sc->cdata->mdts != 0) {
		sc->max_xfer_size = ulmin(sc->max_xfer_size,
		    1 << (sc->cdata->mdts + NVME_MPS_SHIFT +
		    NVME_CAP_HI_MPSMIN(sc->cap >> 32)));
	}

	io = nvlist_get_nvlist_array(nvl, "io", NULL);
	sc->max_pending_io = nvlist_get_number(io[0], "qsize") *
	    sc->num_io_queues;

	error = nvmf_init_sim(sc);
	if (error != 0)
		goto out;

	error = nvmf_start_aer(sc);
	if (error != 0) {
		nvmf_destroy_sim(sc);
		goto out;
	}

	if (!nvmf_add_namespaces(sc)) {
		error = ENXIO;
		nvmf_destroy_sim(sc);
		goto out;
	}

	make_dev_args_init(&mda);
	mda.mda_devsw = &nvmf_cdevsw;
	mda.mda_uid = UID_ROOT;
	mda.mda_gid = GID_WHEEL;
	mda.mda_mode = 0600;
	mda.mda_si_drv1 = sc;
	error = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev));
	if (error != 0) {
		nvmf_destroy_sim(sc);
		goto out;
	}

	sc->shutdown_pre_sync_eh = EVENTHANDLER_REGISTER(shutdown_pre_sync,
	    nvmf_shutdown_pre_sync, sc, SHUTDOWN_PRI_FIRST);
	sc->shutdown_post_sync_eh = EVENTHANDLER_REGISTER(shutdown_post_sync,
	    nvmf_shutdown_post_sync, sc, SHUTDOWN_PRI_LAST);

	return (0);
out:
	if (sc->ns != NULL) {
		for (i = 0; i < sc->cdata->nn; i++) {
			if (sc->ns[i] != NULL)
				nvmf_destroy_ns(sc->ns[i]);
		}
		free(sc->ns, M_NVMF);
	}

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);

	if (sc->admin != NULL)
		nvmf_shutdown_controller(sc);

	for (i = 0; i < sc->num_io_queues; i++) {
		if (sc->io[i] != NULL)
			nvmf_destroy_qp(sc->io[i]);
	}
	free(sc->io, M_NVMF);
	if (sc->admin != NULL)
		nvmf_destroy_qp(sc->admin);

	nvmf_destroy_aer(sc);

	taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
	sx_destroy(&sc->connection_lock);
	nvlist_destroy(sc->rparams);
	free(sc->cdata, M_NVMF);
	return (error);
}

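/*
 * Schedule an asynchronous teardown of the current association after a
 * transport error; the work is done in nvmf_disconnect_task().
 */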
void
nvmf_disconnect(struct nvmf_softc *sc)
{
	taskqueue_enqueue(taskqueue_thread, &sc->disconnect_task);
}

static void
nvmf_disconnect_task(void *arg, int pending __unused)
{
	struct nvmf_softc *sc = arg;
	u_int i;

	sx_xlock(&sc->connection_lock);
	if (sc->admin == NULL) {
		/*
		 * Ignore transport errors if there is no active
		 * association.
		 */
		sx_xunlock(&sc->connection_lock);
		return;
	}

	if (sc->detaching) {
		if (sc->admin != NULL) {
			/*
			 * This unsticks the detach process if a
			 * transport error occurs during detach.
			 */
			nvmf_shutdown_qp(sc->admin);
		}
		sx_xunlock(&sc->connection_lock);
		return;
	}

	if (sc->cdev == NULL) {
		/*
		 * Transport error occurred during attach (nvmf_add_namespaces).
		 * Shutdown the admin queue.
		 */
		nvmf_shutdown_qp(sc->admin);
		sx_xunlock(&sc->connection_lock);
		return;
	}

	nanotime(&sc->last_disconnect);
	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);
	sc->ka_traffic = false;

	/* Quiesce namespace consumers. */
	nvmf_disconnect_sim(sc);
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_disconnect_ns(sc->ns[i]);
	}

	/* Shutdown the existing qpairs. */
	for (i = 0; i < sc->num_io_queues; i++) {
		nvmf_destroy_qp(sc->io[i]);
	}
	free(sc->io, M_NVMF);
	sc->io = NULL;
	sc->num_io_queues = 0;
	nvmf_destroy_qp(sc->admin);
	sc->admin = NULL;

	sx_xunlock(&sc->connection_lock);
}

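/*
 * Handle the NVMF_RECONNECT_HOST ioctl: validate a new handoff from
 * userland, confirm it targets the same NVMe subsystem, establish the
 * replacement association, and restart namespace consumers.
 */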
static int
nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
{
	const struct nvme_controller_data *cdata;
	nvlist_t *nvl;
	u_int i;
	int error;

	error = nvmf_copyin_handoff(nv, &nvl);
	if (error != 0)
		return (error);

	/* XXX: Should we permit changing the transport type? */
	if (sc->trtype != nvlist_get_number(nvl, "trtype")) {
		device_printf(sc->dev,
		    "transport type mismatch on reconnect\n");
		nvlist_destroy(nvl);
		return (EINVAL);
	}

	sx_xlock(&sc->connection_lock);
	if (sc->admin != NULL || sc->detaching) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Ensure this is for the same controller.  Note that the
	 * controller ID can vary across associations if the remote
	 * system is using the dynamic controller model.  This merely
	 * ensures the new association is connected to the same NVMe
	 * subsystem.
	 */
	cdata = nvlist_get_binary(nvl, "cdata", NULL);
	if (memcmp(sc->cdata->subnqn, cdata->subnqn,
	    sizeof(cdata->subnqn)) != 0) {
		device_printf(sc->dev,
		    "controller subsystem NQN mismatch on reconnect\n");
		error = EINVAL;
		goto out;
	}

	/*
	 * XXX: Require same number and size of I/O queues so that
	 * max_pending_io is still correct?
	 */

	error = nvmf_establish_connection(sc, nvl);
	if (error != 0)
		goto out;

	error = nvmf_start_aer(sc);
	if (error != 0)
		goto out;

	device_printf(sc->dev,
	    "established new association with %u I/O queues\n",
	    sc->num_io_queues);

	/* Restart namespace consumers. */
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_reconnect_ns(sc->ns[i]);
	}
	nvmf_reconnect_sim(sc);

	nvmf_rescan_all_ns(sc);
out:
	sx_xunlock(&sc->connection_lock);
	nvlist_destroy(nvl);
	return (error);
}

static void
nvmf_shutdown_pre_sync(void *arg, int howto)
{
	struct nvmf_softc *sc = arg;

	if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED())
		return;

	/*
	 * If this association is disconnected, abort any pending
	 * requests with an error to permit filesystems to unmount
	 * without hanging.
	 */
	sx_xlock(&sc->connection_lock);
	if (sc->admin != NULL || sc->detaching) {
		sx_xunlock(&sc->connection_lock);
		return;
	}

	for (u_int i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_shutdown_ns(sc->ns[i]);
	}
	nvmf_shutdown_sim(sc);
	sx_xunlock(&sc->connection_lock);
}

static void
nvmf_shutdown_post_sync(void *arg, int howto)
{
	struct nvmf_softc *sc = arg;

	if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED())
		return;

	/*
	 * If this association is connected, disconnect gracefully.
	 */
	sx_xlock(&sc->connection_lock);
	if (sc->admin == NULL || sc->detaching) {
		sx_xunlock(&sc->connection_lock);
		return;
	}

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);

	nvmf_shutdown_controller(sc);

	/*
	 * Quiesce consumers so that any commands submitted after this
	 * fail with an error.  Notably, nda(4) calls nda_flush() from
	 * a post_sync handler that might be ordered after this one.
	 */
	for (u_int i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_shutdown_ns(sc->ns[i]);
	}
	nvmf_shutdown_sim(sc);

	for (u_int i = 0; i < sc->num_io_queues; i++) {
		nvmf_destroy_qp(sc->io[i]);
	}
	nvmf_destroy_qp(sc->admin);
	sc->admin = NULL;
	sx_xunlock(&sc->connection_lock);
}

static int
nvmf_detach(device_t dev)
{
	struct nvmf_softc *sc = device_get_softc(dev);
	u_int i;

	destroy_dev(sc->cdev);

	sx_xlock(&sc->connection_lock);
	sc->detaching = true;
	sx_xunlock(&sc->connection_lock);

	EVENTHANDLER_DEREGISTER(shutdown_pre_sync, sc->shutdown_pre_sync_eh);
	EVENTHANDLER_DEREGISTER(shutdown_post_sync, sc->shutdown_post_sync_eh);

	nvmf_destroy_sim(sc);
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_destroy_ns(sc->ns[i]);
	}
	free(sc->ns, M_NVMF);

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);

	if (sc->admin != NULL)
		nvmf_shutdown_controller(sc);

	for (i = 0; i < sc->num_io_queues; i++) {
		nvmf_destroy_qp(sc->io[i]);
	}
	free(sc->io, M_NVMF);

	taskqueue_drain(taskqueue_thread, &sc->disconnect_task);

	if (sc->admin != NULL)
		nvmf_destroy_qp(sc->admin);

	nvmf_destroy_aer(sc);

	sx_destroy(&sc->connection_lock);
	nvlist_destroy(sc->rparams);
	free(sc->cdata, M_NVMF);
	return (0);
}

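/*
 * Update the cached namespace state for a single NSID from fresh
 * IDENTIFY NAMESPACE data and ask CAM to rescan the corresponding
 * target.
 */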
static void
nvmf_rescan_ns_1(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data)
{
	struct nvmf_namespace *ns;

	/* XXX: Needs locking around sc->ns[]. */
	ns = sc->ns[nsid - 1];
	if (data->nsze == 0) {
		/* XXX: Needs locking */
		if (ns != NULL) {
			nvmf_destroy_ns(ns);
			sc->ns[nsid - 1] = NULL;
		}
	} else {
		/* XXX: Needs locking */
		if (ns == NULL) {
			sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
		} else {
			if (!nvmf_update_ns(ns, data)) {
				nvmf_destroy_ns(ns);
				sc->ns[nsid - 1] = NULL;
			}
		}
	}

	nvmf_sim_rescan_ns(sc, nsid);
}

void
nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid)
{
	struct nvmf_completion_status status;
	struct nvme_namespace_data *data;

	data = malloc(sizeof(*data), M_NVMF, M_WAITOK);

	nvmf_status_init(&status);
	nvmf_status_wait_io(&status);
	if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
	    &status, nvmf_io_complete, &status, M_WAITOK)) {
		device_printf(sc->dev,
		    "failed to send IDENTIFY namespace %u command\n", nsid);
		free(data, M_NVMF);
		return;
	}
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev,
		    "IDENTIFY namespace %u failed, status %#x\n", nsid,
		    le16toh(status.cqe.status));
		free(data, M_NVMF);
		return;
	}

	if (status.io_error != 0) {
		device_printf(sc->dev,
		    "IDENTIFY namespace %u failed with I/O error %d\n",
		    nsid, status.io_error);
		free(data, M_NVMF);
		return;
	}

	nvme_namespace_data_swapbytes(data);

	nvmf_rescan_ns_1(sc, nsid, data);

	free(data, M_NVMF);
}

static void
nvmf_purge_namespaces(struct nvmf_softc *sc, uint32_t first_nsid,
    uint32_t next_valid_nsid)
{
	struct nvmf_namespace *ns;

	for (uint32_t nsid = first_nsid; nsid < next_valid_nsid; nsid++) {
		/* XXX: Needs locking around sc->ns[]. */
		ns = sc->ns[nsid - 1];
		if (ns != NULL) {
			nvmf_destroy_ns(ns);
			sc->ns[nsid - 1] = NULL;

			nvmf_sim_rescan_ns(sc, nsid);
		}
	}
}

static bool
nvmf_rescan_ns_cb(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data, void *arg)
{
	uint32_t *last_nsid = arg;

	/* Check for any gaps prior to this namespace. */
	nvmf_purge_namespaces(sc, *last_nsid + 1, nsid);
	*last_nsid = nsid;

	nvmf_rescan_ns_1(sc, nsid, data);
	return (true);
}

void
nvmf_rescan_all_ns(struct nvmf_softc *sc)
{
	uint32_t last_nsid;

	last_nsid = 0;
	if (!nvmf_scan_active_namespaces(sc, nvmf_rescan_ns_cb, &last_nsid))
		return;

	/*
	 * Check for any namespace devices after the last active
	 * namespace.
	 */
	nvmf_purge_namespaces(sc, last_nsid + 1, sc->cdata->nn + 1);
}

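/*
 * Execute a passthrough command on behalf of userland, bouncing the
 * user buffer through a kernel allocation and copying the completion
 * status back into the nvme_pt_command.
 */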
int
nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt,
    bool admin)
{
	struct nvmf_completion_status status;
	struct nvme_command cmd;
	struct memdesc mem;
	struct nvmf_host_qpair *qp;
	struct nvmf_request *req;
	void *buf;
	int error;

	if (pt->len > sc->max_xfer_size)
		return (EINVAL);

	buf = NULL;
	if (pt->len != 0) {
		/*
		 * XXX: Depending on the size we may want to pin the
		 * user pages and use a memdesc with vm_page_t's
		 * instead.
		 */
		buf = malloc(pt->len, M_NVMF, M_WAITOK);
		if (pt->is_read == 0) {
			error = copyin(pt->buf, buf, pt->len);
			if (error != 0) {
				free(buf, M_NVMF);
				return (error);
			}
		} else {
			/* Ensure no kernel data is leaked to userland. */
			memset(buf, 0, pt->len);
		}
	}

	memset(&cmd, 0, sizeof(cmd));
	cmd.opc = pt->cmd.opc;
	cmd.fuse = pt->cmd.fuse;
	cmd.nsid = pt->cmd.nsid;
	cmd.cdw10 = pt->cmd.cdw10;
	cmd.cdw11 = pt->cmd.cdw11;
	cmd.cdw12 = pt->cmd.cdw12;
	cmd.cdw13 = pt->cmd.cdw13;
	cmd.cdw14 = pt->cmd.cdw14;
	cmd.cdw15 = pt->cmd.cdw15;

	sx_slock(&sc->connection_lock);
	if (sc->admin == NULL || sc->detaching) {
		device_printf(sc->dev,
		    "failed to send passthrough command\n");
		error = ECONNABORTED;
		sx_sunlock(&sc->connection_lock);
		goto error;
	}
	if (admin)
		qp = sc->admin;
	else
		qp = nvmf_select_io_queue(sc);
	nvmf_status_init(&status);
	req = nvmf_allocate_request(qp, &cmd, nvmf_complete, &status, M_WAITOK);
	sx_sunlock(&sc->connection_lock);
	if (req == NULL) {
		device_printf(sc->dev, "failed to send passthrough command\n");
		error = ECONNABORTED;
		goto error;
	}

	if (pt->len != 0) {
		mem = memdesc_vaddr(buf, pt->len);
		nvmf_capsule_append_data(req->nc, &mem, pt->len,
		    pt->is_read == 0, nvmf_io_complete, &status);
		nvmf_status_wait_io(&status);
	}

	nvmf_submit_request(req);
	nvmf_wait_for_reply(&status);

	memset(&pt->cpl, 0, sizeof(pt->cpl));
	pt->cpl.cdw0 = status.cqe.cdw0;
	pt->cpl.status = status.cqe.status;

	error = status.io_error;
	if (error == 0 && pt->len != 0 && pt->is_read != 0)
		error = copyout(buf, pt->buf, pt->len);
error:
	free(buf, M_NVMF);
	return (error);
}

static int
nvmf_reconnect_params(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
{
	int error;

	sx_slock(&sc->connection_lock);
	error = nvmf_pack_ioc_nvlist(sc->rparams, nv);
	sx_sunlock(&sc->connection_lock);

	return (error);
}

static int
nvmf_connection_status(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
{
	nvlist_t *nvl, *nvl_ts;
	int error;

	nvl = nvlist_create(0);
	nvl_ts = nvlist_create(0);

	sx_slock(&sc->connection_lock);
	nvlist_add_bool(nvl, "connected", sc->admin != NULL);
	nvlist_add_number(nvl_ts, "tv_sec", sc->last_disconnect.tv_sec);
	nvlist_add_number(nvl_ts, "tv_nsec", sc->last_disconnect.tv_nsec);
	sx_sunlock(&sc->connection_lock);
	nvlist_move_nvlist(nvl, "last_disconnect", nvl_ts);

	error = nvmf_pack_ioc_nvlist(nvl, nv);
	nvlist_destroy(nvl);
	return (error);
}

static int
nvmf_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
    struct thread *td)
{
	struct nvmf_softc *sc = cdev->si_drv1;
	struct nvme_get_nsid *gnsid;
	struct nvme_pt_command *pt;
	struct nvmf_ioc_nv *nv;

	switch (cmd) {
	case NVME_PASSTHROUGH_CMD:
		pt = (struct nvme_pt_command *)arg;
		return (nvmf_passthrough_cmd(sc, pt, true));
	case NVME_GET_NSID:
		gnsid = (struct nvme_get_nsid *)arg;
		strlcpy(gnsid->cdev, device_get_nameunit(sc->dev),
		    sizeof(gnsid->cdev));
		gnsid->nsid = 0;
		return (0);
	case NVME_GET_MAX_XFER_SIZE:
		*(uint64_t *)arg = sc->max_xfer_size;
		return (0);
	case NVME_GET_CONTROLLER_DATA:
		memcpy(arg, sc->cdata, sizeof(*sc->cdata));
		return (0);
	case NVMF_RECONNECT_PARAMS:
		nv = (struct nvmf_ioc_nv *)arg;
		return (nvmf_reconnect_params(sc, nv));
	case NVMF_RECONNECT_HOST:
		nv = (struct nvmf_ioc_nv *)arg;
		return (nvmf_reconnect_host(sc, nv));
	case NVMF_CONNECTION_STATUS:
		nv = (struct nvmf_ioc_nv *)arg;
		return (nvmf_connection_status(sc, nv));
	default:
		return (ENOTTY);
	}
}

static struct cdevsw nvmf_cdevsw = {
	.d_version = D_VERSION,
	.d_ioctl = nvmf_ioctl
};

static int
nvmf_modevent(module_t mod, int what, void *arg)
{
	switch (what) {
	case MOD_LOAD:
		return (nvmf_ctl_load());
	case MOD_QUIESCE:
		return (0);
	case MOD_UNLOAD:
		nvmf_ctl_unload();
		destroy_dev_drain(&nvmf_cdevsw);
		return (0);
	default:
		return (EOPNOTSUPP);
	}
}

static device_method_t nvmf_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,     nvmf_probe),
	DEVMETHOD(device_attach,    nvmf_attach),
	DEVMETHOD(device_detach,    nvmf_detach),
	DEVMETHOD_END
};

driver_t nvme_nvmf_driver = {
	"nvme",
	nvmf_methods,
	sizeof(struct nvmf_softc),
};

DRIVER_MODULE(nvme, root, nvme_nvmf_driver, nvmf_modevent, NULL);
MODULE_DEPEND(nvmf, nvmf_transport, 1, 1, 1);