xref: /freebsd/sys/dev/nvmf/controller/ctl_frontend_nvmf.c (revision dd21556857e8d40f66bf5ad54754d9d52669ebf7)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2023-2024 Chelsio Communications, Inc.
5  * Written by: John Baldwin <jhb@FreeBSD.org>
6  */
7 
8 #include <sys/param.h>
9 #include <sys/dnv.h>
10 #include <sys/jail.h>
11 #include <sys/kernel.h>
12 #include <sys/limits.h>
13 #include <sys/lock.h>
14 #include <sys/malloc.h>
15 #include <sys/mbuf.h>
16 #include <sys/memdesc.h>
17 #include <sys/module.h>
18 #include <sys/proc.h>
19 #include <sys/queue.h>
20 #include <sys/refcount.h>
21 #include <sys/sbuf.h>
22 #include <sys/smp.h>
23 #include <sys/sx.h>
24 #include <sys/taskqueue.h>
25 
26 #include <machine/bus.h>
27 #include <machine/bus_dma.h>
28 
29 #include <dev/nvmf/nvmf.h>
30 #include <dev/nvmf/nvmf_transport.h>
31 #include <dev/nvmf/controller/nvmft_subr.h>
32 #include <dev/nvmf/controller/nvmft_var.h>
33 
34 #include <cam/ctl/ctl.h>
35 #include <cam/ctl/ctl_error.h>
36 #include <cam/ctl/ctl_ha.h>
37 #include <cam/ctl/ctl_io.h>
38 #include <cam/ctl/ctl_frontend.h>
39 #include <cam/ctl/ctl_private.h>
40 
/*
 * Frontend-private state attached to each CTL I/O request.  The two
 * pointer members of CTL_PRIV_FRONTEND hold the NVMe-oF capsule
 * (NVMFT_NC) and the queue pair the request belongs to (NVMFT_QP).
 * nvmft_done() treats a NULL capsule as the completion of the task
 * request issued by nvmft_terminate_commands().
 */
#define	NVMFT_NC(io)	((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptrs[0])
#define	NVMFT_QP(io)	((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptrs[1])
47 
/* Forward declarations for functions referenced before their definitions. */
static void	nvmft_done(union ctl_io *io);
static int	nvmft_init(void);
static int	nvmft_ioctl(struct cdev *cdev, u_long cmd, caddr_t data,
    int flag, struct thread *td);
static int	nvmft_shutdown(void);

/* Task queue created by nvmft_init() and used by nvmft_enqueue_task(). */
static struct taskqueue *nvmft_taskq;
/* All ports created by this frontend; walked with nvmft_ports_lock held. */
static TAILQ_HEAD(, nvmft_port) nvmft_ports;
static struct sx nvmft_ports_lock;

MALLOC_DEFINE(M_NVMFT, "nvmft", "NVMe over Fabrics controller");

/* Frontend descriptor handed to CTL by CTL_FRONTEND_DECLARE(). */
static struct ctl_frontend nvmft_frontend = {
	.name = "nvmf",
	.init = nvmft_init,
	.ioctl = nvmft_ioctl,
	.fe_dump = NULL,
	.shutdown = nvmft_shutdown,
};
67 
68 static void
69 nvmft_online(void *arg)
70 {
71 	struct nvmft_port *np = arg;
72 
73 	sx_xlock(&np->lock);
74 	np->online = true;
75 	sx_xunlock(&np->lock);
76 }
77 
78 static void
79 nvmft_offline(void *arg)
80 {
81 	struct nvmft_port *np = arg;
82 	struct nvmft_controller *ctrlr;
83 
84 	sx_xlock(&np->lock);
85 	np->online = false;
86 
87 	TAILQ_FOREACH(ctrlr, &np->controllers, link) {
88 		nvmft_printf(ctrlr,
89 		    "shutting down due to port going offline\n");
90 		nvmft_controller_error(ctrlr, NULL, ENODEV);
91 	}
92 
93 	while (!TAILQ_EMPTY(&np->controllers))
94 		sx_sleep(np, &np->lock, 0, "nvmfoff", 0);
95 	sx_xunlock(&np->lock);
96 }
97 
98 static int
99 nvmft_lun_enable(void *arg, int lun_id)
100 {
101 	struct nvmft_port *np = arg;
102 	struct nvmft_controller *ctrlr;
103 	uint32_t *old_ns, *new_ns;
104 	uint32_t nsid;
105 	u_int i;
106 
107 	if (lun_id >= le32toh(np->cdata.nn)) {
108 		printf("NVMFT: %s lun %d larger than maximum nsid %u\n",
109 		    np->cdata.subnqn, lun_id, le32toh(np->cdata.nn));
110 		return (EOPNOTSUPP);
111 	}
112 	nsid = lun_id + 1;
113 
114 	sx_xlock(&np->lock);
115 	new_ns = mallocarray(np->num_ns + 1, sizeof(*new_ns), M_NVMFT,
116 	    M_WAITOK);
117 	for (i = 0; i < np->num_ns; i++) {
118 		if (np->active_ns[i] < nsid)
119 			continue;
120 		if (np->active_ns[i] == nsid) {
121 			sx_xunlock(&np->lock);
122 			free(new_ns, M_NVMFT);
123 			printf("NVMFT: %s duplicate lun %d\n",
124 			    np->cdata.subnqn, lun_id);
125 			return (EINVAL);
126 		}
127 		break;
128 	}
129 
130 	/* Copy over IDs smaller than nsid. */
131 	memcpy(new_ns, np->active_ns, i * sizeof(*np->active_ns));
132 
133 	/* Insert nsid. */
134 	new_ns[i] = nsid;
135 
136 	/* Copy over IDs greater than nsid. */
137 	memcpy(new_ns + i + 1, np->active_ns + i, (np->num_ns - i) *
138 	    sizeof(*np->active_ns));
139 
140 	np->num_ns++;
141 	old_ns = np->active_ns;
142 	np->active_ns = new_ns;
143 
144 	TAILQ_FOREACH(ctrlr, &np->controllers, link) {
145 		nvmft_controller_lun_changed(ctrlr, lun_id);
146 	}
147 
148 	sx_xunlock(&np->lock);
149 	free(old_ns, M_NVMFT);
150 
151 	return (0);
152 }
153 
154 static int
155 nvmft_lun_disable(void *arg, int lun_id)
156 {
157 	struct nvmft_port *np = arg;
158 	struct nvmft_controller *ctrlr;
159 	uint32_t nsid;
160 	u_int i;
161 
162 	if (lun_id >= le32toh(np->cdata.nn))
163 		return (0);
164 	nsid = lun_id + 1;
165 
166 	sx_xlock(&np->lock);
167 	for (i = 0; i < np->num_ns; i++) {
168 		if (np->active_ns[i] == nsid)
169 			goto found;
170 	}
171 	sx_xunlock(&np->lock);
172 	printf("NVMFT: %s request to disable nonexistent lun %d\n",
173 	    np->cdata.subnqn, lun_id);
174 	return (EINVAL);
175 
176 found:
177 	/* Move down IDs greater than nsid. */
178 	memmove(np->active_ns + i, np->active_ns + i + 1,
179 	    (np->num_ns - (i + 1)) * sizeof(*np->active_ns));
180 	np->num_ns--;
181 
182 	/* NB: Don't bother freeing the old active_ns array. */
183 
184 	TAILQ_FOREACH(ctrlr, &np->controllers, link) {
185 		nvmft_controller_lun_changed(ctrlr, lun_id);
186 	}
187 
188 	sx_xunlock(&np->lock);
189 
190 	return (0);
191 }
192 
193 void
194 nvmft_populate_active_nslist(struct nvmft_port *np, uint32_t nsid,
195     struct nvme_ns_list *nslist)
196 {
197 	u_int i, count;
198 
199 	sx_slock(&np->lock);
200 	count = 0;
201 	for (i = 0; i < np->num_ns; i++) {
202 		if (np->active_ns[i] <= nsid)
203 			continue;
204 		nslist->ns[count] = htole32(np->active_ns[i]);
205 		count++;
206 		if (count == nitems(nslist->ns))
207 			break;
208 	}
209 	sx_sunlock(&np->lock);
210 }
211 
212 void
213 nvmft_dispatch_command(struct nvmft_qpair *qp, struct nvmf_capsule *nc,
214     bool admin)
215 {
216 	struct nvmft_controller *ctrlr = nvmft_qpair_ctrlr(qp);
217 	const struct nvme_command *cmd = nvmf_capsule_sqe(nc);
218 	struct nvmft_port *np = ctrlr->np;
219 	union ctl_io *io;
220 	int error;
221 
222 	if (cmd->nsid == htole32(0)) {
223 		nvmft_send_generic_error(qp, nc,
224 		    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
225 		nvmf_free_capsule(nc);
226 		return;
227 	}
228 
229 	mtx_lock(&ctrlr->lock);
230 	if (ctrlr->pending_commands == 0)
231 		ctrlr->start_busy = sbinuptime();
232 	ctrlr->pending_commands++;
233 	mtx_unlock(&ctrlr->lock);
234 	io = ctl_alloc_io(np->port.ctl_pool_ref);
235 	ctl_zero_io(io);
236 	NVMFT_NC(io) = nc;
237 	NVMFT_QP(io) = qp;
238 	io->io_hdr.io_type = admin ? CTL_IO_NVME_ADMIN : CTL_IO_NVME;
239 	io->io_hdr.nexus.initid = ctrlr->cntlid;
240 	io->io_hdr.nexus.targ_port = np->port.targ_port;
241 	io->io_hdr.nexus.targ_lun = le32toh(cmd->nsid) - 1;
242 	io->nvmeio.cmd = *cmd;
243 	error = ctl_run(io);
244 	if (error != 0) {
245 		nvmft_printf(ctrlr, "ctl_run failed for command on %s: %d\n",
246 		    nvmft_qpair_name(qp), error);
247 		ctl_nvme_set_generic_error(&io->nvmeio,
248 		    NVME_SC_INTERNAL_DEVICE_ERROR);
249 		nvmft_done(io);
250 
251 		nvmft_controller_error(ctrlr, qp, ENXIO);
252 	}
253 }
254 
255 void
256 nvmft_terminate_commands(struct nvmft_controller *ctrlr)
257 {
258 	struct nvmft_port *np = ctrlr->np;
259 	union ctl_io *io;
260 	int error;
261 
262 	mtx_lock(&ctrlr->lock);
263 	if (ctrlr->pending_commands == 0)
264 		ctrlr->start_busy = sbinuptime();
265 	ctrlr->pending_commands++;
266 	mtx_unlock(&ctrlr->lock);
267 	io = ctl_alloc_io(np->port.ctl_pool_ref);
268 	ctl_zero_io(io);
269 	NVMFT_QP(io) = ctrlr->admin;
270 	io->io_hdr.io_type = CTL_IO_TASK;
271 	io->io_hdr.nexus.initid = ctrlr->cntlid;
272 	io->io_hdr.nexus.targ_port = np->port.targ_port;
273 	io->io_hdr.nexus.targ_lun = 0;
274 	io->taskio.tag_type = CTL_TAG_SIMPLE; /* XXX: unused? */
275 	io->taskio.task_action = CTL_TASK_I_T_NEXUS_RESET;
276 	error = ctl_run(io);
277 	if (error != CTL_RETVAL_COMPLETE) {
278 		nvmft_printf(ctrlr, "failed to terminate tasks: %d\n", error);
279 #ifdef INVARIANTS
280 		io->io_hdr.status = CTL_SUCCESS;
281 #endif
282 		nvmft_done(io);
283 	}
284 }
285 
286 static void
287 nvmft_datamove_out_cb(void *arg, size_t xfered, int error)
288 {
289 	struct ctl_nvmeio *ctnio = arg;
290 
291 	if (error != 0) {
292 		ctl_nvme_set_data_transfer_error(ctnio);
293 	} else {
294 		MPASS(xfered == ctnio->kern_data_len);
295 		ctnio->kern_data_resid -= xfered;
296 	}
297 
298 	if (ctnio->kern_sg_entries) {
299 		free(ctnio->ext_data_ptr, M_NVMFT);
300 		ctnio->ext_data_ptr = NULL;
301 	} else
302 		MPASS(ctnio->ext_data_ptr == NULL);
303 	ctl_datamove_done((union ctl_io *)ctnio, false);
304 }
305 
306 static void
307 nvmft_datamove_out(struct ctl_nvmeio *ctnio, struct nvmft_qpair *qp,
308     struct nvmf_capsule *nc)
309 {
310 	struct memdesc mem;
311 	int error;
312 
313 	MPASS(ctnio->ext_data_ptr == NULL);
314 	if (ctnio->kern_sg_entries > 0) {
315 		struct ctl_sg_entry *sgl;
316 		struct bus_dma_segment *vlist;
317 
318 		vlist = mallocarray(ctnio->kern_sg_entries, sizeof(*vlist),
319 		    M_NVMFT, M_WAITOK);
320 		ctnio->ext_data_ptr = (void *)vlist;
321 		sgl = (struct ctl_sg_entry *)ctnio->kern_data_ptr;
322 		for (u_int i = 0; i < ctnio->kern_sg_entries; i++) {
323 			vlist[i].ds_addr = (uintptr_t)sgl[i].addr;
324 			vlist[i].ds_len = sgl[i].len;
325 		}
326 		mem = memdesc_vlist(vlist, ctnio->kern_sg_entries);
327 	} else
328 		mem = memdesc_vaddr(ctnio->kern_data_ptr, ctnio->kern_data_len);
329 
330 	error = nvmf_receive_controller_data(nc, ctnio->kern_rel_offset, &mem,
331 	    ctnio->kern_data_len, nvmft_datamove_out_cb, ctnio);
332 	if (error == 0)
333 		return;
334 
335 	nvmft_printf(nvmft_qpair_ctrlr(qp),
336 	    "Failed to request capsule data: %d\n", error);
337 	ctl_nvme_set_data_transfer_error(ctnio);
338 
339 	if (ctnio->kern_sg_entries) {
340 		free(ctnio->ext_data_ptr, M_NVMFT);
341 		ctnio->ext_data_ptr = NULL;
342 	} else
343 		MPASS(ctnio->ext_data_ptr == NULL);
344 	ctl_datamove_done((union ctl_io *)ctnio, true);
345 }
346 
347 static struct mbuf *
348 nvmft_copy_data(struct ctl_nvmeio *ctnio)
349 {
350 	struct ctl_sg_entry *sgl;
351 	struct mbuf *m0, *m;
352 	uint32_t resid, off, todo;
353 	int mlen;
354 
355 	MPASS(ctnio->kern_data_len != 0);
356 
357 	m0 = m_getm2(NULL, ctnio->kern_data_len, M_WAITOK, MT_DATA, 0);
358 
359 	if (ctnio->kern_sg_entries == 0) {
360 		m_copyback(m0, 0, ctnio->kern_data_len, ctnio->kern_data_ptr);
361 		return (m0);
362 	}
363 
364 	resid = ctnio->kern_data_len;
365 	sgl = (struct ctl_sg_entry *)ctnio->kern_data_ptr;
366 	off = 0;
367 	m = m0;
368 	mlen = M_TRAILINGSPACE(m);
369 	for (;;) {
370 		todo = MIN(mlen, sgl->len - off);
371 		memcpy(mtod(m, char *) + m->m_len, (char *)sgl->addr + off,
372 		    todo);
373 		m->m_len += todo;
374 		resid -= todo;
375 		if (resid == 0) {
376 			MPASS(m->m_next == NULL);
377 			break;
378 		}
379 
380 		off += todo;
381 		if (off == sgl->len) {
382 			sgl++;
383 			off = 0;
384 		}
385 		mlen -= todo;
386 		if (mlen == 0) {
387 			m = m->m_next;
388 			mlen = M_TRAILINGSPACE(m);
389 		}
390 	}
391 
392 	return (m0);
393 }
394 
395 static void
396 m_free_ref_data(struct mbuf *m)
397 {
398 	ctl_ref kern_data_ref = m->m_ext.ext_arg1;
399 
400 	kern_data_ref(m->m_ext.ext_arg2, -1);
401 }
402 
403 static struct mbuf *
404 m_get_ref_data(struct ctl_nvmeio *ctnio, void *buf, u_int size)
405 {
406 	struct mbuf *m;
407 
408 	m = m_get(M_WAITOK, MT_DATA);
409 	m_extadd(m, buf, size, m_free_ref_data, ctnio->kern_data_ref,
410 	    ctnio->kern_data_arg, M_RDONLY, EXT_CTL);
411 	m->m_len = size;
412 	ctnio->kern_data_ref(ctnio->kern_data_arg, 1);
413 	return (m);
414 }
415 
416 static struct mbuf *
417 nvmft_ref_data(struct ctl_nvmeio *ctnio)
418 {
419 	struct ctl_sg_entry *sgl;
420 	struct mbuf *m0, *m;
421 
422 	MPASS(ctnio->kern_data_len != 0);
423 
424 	if (ctnio->kern_sg_entries == 0)
425 		return (m_get_ref_data(ctnio, ctnio->kern_data_ptr,
426 		    ctnio->kern_data_len));
427 
428 	sgl = (struct ctl_sg_entry *)ctnio->kern_data_ptr;
429 	m0 = m_get_ref_data(ctnio, sgl[0].addr, sgl[0].len);
430 	m = m0;
431 	for (u_int i = 1; i < ctnio->kern_sg_entries; i++) {
432 		m->m_next = m_get_ref_data(ctnio, sgl[i].addr, sgl[i].len);
433 		m = m->m_next;
434 	}
435 	return (m0);
436 }
437 
438 static void
439 nvmft_datamove_in(struct ctl_nvmeio *ctnio, struct nvmft_qpair *qp,
440     struct nvmf_capsule *nc)
441 {
442 	struct mbuf *m;
443 	u_int status;
444 
445 	if (ctnio->kern_data_ref != NULL)
446 		m = nvmft_ref_data(ctnio);
447 	else
448 		m = nvmft_copy_data(ctnio);
449 	status = nvmf_send_controller_data(nc, ctnio->kern_rel_offset, m,
450 	    ctnio->kern_data_len);
451 	switch (status) {
452 	case NVMF_SUCCESS_SENT:
453 		ctnio->success_sent = true;
454 		nvmft_command_completed(qp, nc);
455 		/* FALLTHROUGH */
456 	case NVMF_MORE:
457 	case NVME_SC_SUCCESS:
458 		break;
459 	default:
460 		ctl_nvme_set_generic_error(ctnio, status);
461 		break;
462 	}
463 	ctl_datamove_done((union ctl_io *)ctnio, true);
464 }
465 
466 void
467 nvmft_handle_datamove(union ctl_io *io)
468 {
469 	struct nvmf_capsule *nc;
470 	struct nvmft_qpair *qp;
471 
472 	/* Some CTL commands preemptively set a success status. */
473 	MPASS(io->io_hdr.status == CTL_STATUS_NONE ||
474 	    io->io_hdr.status == CTL_SUCCESS);
475 	MPASS(!io->nvmeio.success_sent);
476 
477 	nc = NVMFT_NC(io);
478 	qp = NVMFT_QP(io);
479 
480 	if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_IN)
481 		nvmft_datamove_in(&io->nvmeio, qp, nc);
482 	else
483 		nvmft_datamove_out(&io->nvmeio, qp, nc);
484 }
485 
486 void
487 nvmft_abort_datamove(union ctl_io *io)
488 {
489 	io->io_hdr.port_status = 1;
490 	io->io_hdr.flags |= CTL_FLAG_ABORT;
491 	ctl_datamove_done(io, true);
492 }
493 
/*
 * CTL fe_datamove callback: forward the request to the queue pair
 * recorded in the I/O's frontend-private data.
 */
static void
nvmft_datamove(union ctl_io *io)
{
	nvmft_qpair_datamove(NVMFT_QP(io), io);
}
502 
503 void
504 nvmft_enqueue_task(struct task *task)
505 {
506 	taskqueue_enqueue(nvmft_taskq, task);
507 }
508 
509 void
510 nvmft_drain_task(struct task *task)
511 {
512 	taskqueue_drain(nvmft_taskq, task);
513 }
514 
/*
 * Add addend to a 128-bit counter stored as two little-endian 64-bit
 * words, low word first.
 *
 * Both words are converted to host order before any arithmetic and
 * back to little-endian when stored, so a carry out of the low word
 * increments the high word correctly regardless of host endianness.
 */
static void
hip_add(uint64_t pair[2], uint64_t addend)
{
	uint64_t low, sum;

	low = le64toh(pair[0]);
	sum = low + addend;
	pair[0] = htole64(sum);

	/* Unsigned wrap-around of the low word means a carry. */
	if (sum < low)
		pair[1] = htole64(le64toh(pair[1]) + 1);
}
526 
527 static void
528 nvmft_done(union ctl_io *io)
529 {
530 	struct nvmft_controller *ctrlr;
531 	const struct nvme_command *cmd;
532 	struct nvmft_qpair *qp;
533 	struct nvmf_capsule *nc;
534 	size_t len;
535 
536 	KASSERT(io->io_hdr.status == CTL_SUCCESS ||
537 	    io->io_hdr.status == CTL_NVME_ERROR,
538 	    ("%s: bad status %u", __func__, io->io_hdr.status));
539 
540 	nc = NVMFT_NC(io);
541 	qp = NVMFT_QP(io);
542 	ctrlr = nvmft_qpair_ctrlr(qp);
543 
544 	if (nc == NULL) {
545 		/* Completion of nvmft_terminate_commands. */
546 		goto end;
547 	}
548 
549 	cmd = nvmf_capsule_sqe(nc);
550 
551 	if (io->io_hdr.status == CTL_SUCCESS)
552 		len = nvmf_capsule_data_len(nc) / 512;
553 	else
554 		len = 0;
555 	switch (cmd->opc) {
556 	case NVME_OPC_WRITE:
557 		mtx_lock(&ctrlr->lock);
558 		hip_add(ctrlr->hip.host_write_commands, 1);
559 		len += ctrlr->partial_duw;
560 		if (len > 1000)
561 			hip_add(ctrlr->hip.data_units_written, len / 1000);
562 		ctrlr->partial_duw = len % 1000;
563 		mtx_unlock(&ctrlr->lock);
564 		break;
565 	case NVME_OPC_READ:
566 	case NVME_OPC_COMPARE:
567 	case NVME_OPC_VERIFY:
568 		mtx_lock(&ctrlr->lock);
569 		if (cmd->opc != NVME_OPC_VERIFY)
570 			hip_add(ctrlr->hip.host_read_commands, 1);
571 		len += ctrlr->partial_dur;
572 		if (len > 1000)
573 			hip_add(ctrlr->hip.data_units_read, len / 1000);
574 		ctrlr->partial_dur = len % 1000;
575 		mtx_unlock(&ctrlr->lock);
576 		break;
577 	}
578 
579 	if (io->nvmeio.success_sent) {
580 		MPASS(io->io_hdr.status == CTL_SUCCESS);
581 	} else {
582 		io->nvmeio.cpl.cid = cmd->cid;
583 		nvmft_send_response(qp, &io->nvmeio.cpl);
584 	}
585 	nvmf_free_capsule(nc);
586 end:
587 	ctl_free_io(io);
588 	mtx_lock(&ctrlr->lock);
589 	ctrlr->pending_commands--;
590 	if (ctrlr->pending_commands == 0)
591 		ctrlr->busy_total += sbinuptime() - ctrlr->start_busy;
592 	mtx_unlock(&ctrlr->lock);
593 }
594 
595 static int
596 nvmft_init(void)
597 {
598 	int error;
599 
600 	nvmft_taskq = taskqueue_create("nvmft", M_WAITOK,
601 	    taskqueue_thread_enqueue, &nvmft_taskq);
602 	error = taskqueue_start_threads_in_proc(&nvmft_taskq, mp_ncpus, PWAIT,
603 	    control_softc->ctl_proc, "nvmft");
604 	if (error != 0) {
605 		taskqueue_free(nvmft_taskq);
606 		return (error);
607 	}
608 
609 	TAILQ_INIT(&nvmft_ports);
610 	sx_init(&nvmft_ports_lock, "nvmft ports");
611 	return (0);
612 }
613 
614 void
615 nvmft_port_free(struct nvmft_port *np)
616 {
617 	KASSERT(TAILQ_EMPTY(&np->controllers),
618 	    ("%s(%p): active controllers", __func__, np));
619 
620 	if (np->port.targ_port != -1) {
621 		if (ctl_port_deregister(&np->port) != 0)
622 			printf("%s: ctl_port_deregister() failed\n", __func__);
623 	}
624 
625 	free(np->active_ns, M_NVMFT);
626 	clean_unrhdr(np->ids);
627 	delete_unrhdr(np->ids);
628 	sx_destroy(&np->lock);
629 	free(np, M_NVMFT);
630 }
631 
632 static struct nvmft_port *
633 nvmft_port_find(const char *subnqn)
634 {
635 	struct nvmft_port *np;
636 
637 	KASSERT(nvmf_nqn_valid(subnqn), ("%s: invalid nqn", __func__));
638 
639 	sx_assert(&nvmft_ports_lock, SA_LOCKED);
640 	TAILQ_FOREACH(np, &nvmft_ports, link) {
641 		if (strcmp(np->cdata.subnqn, subnqn) == 0)
642 			break;
643 	}
644 	return (np);
645 }
646 
647 static struct nvmft_port *
648 nvmft_port_find_by_id(int port_id)
649 {
650 	struct nvmft_port *np;
651 
652 	sx_assert(&nvmft_ports_lock, SA_LOCKED);
653 	TAILQ_FOREACH(np, &nvmft_ports, link) {
654 		if (np->port.targ_port == port_id)
655 			break;
656 	}
657 	return (np);
658 }
659 
660 /*
661  * Helper function to fetch a number stored as a string in an nv_list.
662  * Returns false if the string was not a valid number.
663  */
664 static bool
665 dnvlist_get_strnum(nvlist_t *nvl, const char *name, u_long default_value,
666 	u_long *value)
667 {
668 	const char *str;
669 	char *cp;
670 
671 	str = dnvlist_get_string(nvl, name, NULL);
672 	if (str == NULL) {
673 		*value = default_value;
674 		return (true);
675 	}
676 	if (*str == '\0')
677 		return (false);
678 	*value = strtoul(str, &cp, 0);
679 	if (*cp != '\0')
680 		return (false);
681 	return (true);
682 }
683 
684 /*
685  * NVMeoF ports support the following parameters:
686  *
687  * Mandatory:
688  *
689  * subnqn: subsystem NVMe Qualified Name
690  * portid: integer port ID from Discovery Log Page entry
691  *
692  * Optional:
693  * serial: Serial Number string
694  * max_io_qsize: Maximum number of I/O queue entries
695  * enable_timeout: Timeout for controller enable in milliseconds
696  * ioccsz: Maximum command capsule size
697  * iorcsz: Maximum response capsule size
698  * nn: Number of namespaces
699  */
700 static void
701 nvmft_port_create(struct ctl_req *req)
702 {
703 	struct nvmft_port *np;
704 	struct ctl_port *port;
705 	const char *serial, *subnqn;
706 	char serial_buf[NVME_SERIAL_NUMBER_LENGTH];
707 	u_long enable_timeout, hostid, ioccsz, iorcsz, max_io_qsize, nn, portid;
708 	int error;
709 
710 	/* Required parameters. */
711 	subnqn = dnvlist_get_string(req->args_nvl, "subnqn", NULL);
712 	if (subnqn == NULL || !nvlist_exists_string(req->args_nvl, "portid")) {
713 		req->status = CTL_LUN_ERROR;
714 		snprintf(req->error_str, sizeof(req->error_str),
715 		    "Missing required argument");
716 		return;
717 	}
718 	if (!nvmf_nqn_valid(subnqn)) {
719 		req->status = CTL_LUN_ERROR;
720 		snprintf(req->error_str, sizeof(req->error_str),
721 		    "Invalid SubNQN");
722 		return;
723 	}
724 	if (!dnvlist_get_strnum(req->args_nvl, "portid", UINT16_MAX, &portid) ||
725 	    portid > UINT16_MAX) {
726 		req->status = CTL_LUN_ERROR;
727 		snprintf(req->error_str, sizeof(req->error_str),
728 		    "Invalid port ID");
729 		return;
730 	}
731 
732 	/* Optional parameters. */
733 	if (!dnvlist_get_strnum(req->args_nvl, "max_io_qsize",
734 	    NVMF_MAX_IO_ENTRIES, &max_io_qsize) ||
735 	    max_io_qsize < NVME_MIN_IO_ENTRIES ||
736 	    max_io_qsize > NVME_MAX_IO_ENTRIES) {
737 		req->status = CTL_LUN_ERROR;
738 		snprintf(req->error_str, sizeof(req->error_str),
739 		    "Invalid maximum I/O queue size");
740 		return;
741 	}
742 
743 	if (!dnvlist_get_strnum(req->args_nvl, "enable_timeout",
744 	    NVMF_CC_EN_TIMEOUT * 500, &enable_timeout) ||
745 	    (enable_timeout % 500) != 0 || (enable_timeout / 500) > 255) {
746 		req->status = CTL_LUN_ERROR;
747 		snprintf(req->error_str, sizeof(req->error_str),
748 		    "Invalid enable timeout");
749 		return;
750 	}
751 
752 	if (!dnvlist_get_strnum(req->args_nvl, "ioccsz", NVMF_IOCCSZ,
753 	    &ioccsz) || ioccsz < sizeof(struct nvme_command) ||
754 	    (ioccsz % 16) != 0) {
755 		req->status = CTL_LUN_ERROR;
756 		snprintf(req->error_str, sizeof(req->error_str),
757 		    "Invalid Command Capsule size");
758 		return;
759 	}
760 
761 	if (!dnvlist_get_strnum(req->args_nvl, "iorcsz", NVMF_IORCSZ,
762 	    &iorcsz) || iorcsz < sizeof(struct nvme_completion) ||
763 	    (iorcsz % 16) != 0) {
764 		req->status = CTL_LUN_ERROR;
765 		snprintf(req->error_str, sizeof(req->error_str),
766 		    "Invalid Response Capsule size");
767 		return;
768 	}
769 
770 	if (!dnvlist_get_strnum(req->args_nvl, "nn", NVMF_NN, &nn) ||
771 	    nn < 1 || nn > UINT32_MAX) {
772 		req->status = CTL_LUN_ERROR;
773 		snprintf(req->error_str, sizeof(req->error_str),
774 		    "Invalid number of namespaces");
775 		return;
776 	}
777 
778 	serial = dnvlist_get_string(req->args_nvl, "serial", NULL);
779 	if (serial == NULL) {
780 		getcredhostid(curthread->td_ucred, &hostid);
781 		nvmf_controller_serial(serial_buf, sizeof(serial_buf), hostid);
782 		serial = serial_buf;
783 	}
784 
785 	sx_xlock(&nvmft_ports_lock);
786 
787 	np = nvmft_port_find(subnqn);
788 	if (np != NULL) {
789 		req->status = CTL_LUN_ERROR;
790 		snprintf(req->error_str, sizeof(req->error_str),
791 		    "SubNQN \"%s\" already exists", subnqn);
792 		sx_xunlock(&nvmft_ports_lock);
793 		return;
794 	}
795 
796 	np = malloc(sizeof(*np), M_NVMFT, M_WAITOK | M_ZERO);
797 	refcount_init(&np->refs, 1);
798 	np->max_io_qsize = max_io_qsize;
799 	np->cap = _nvmf_controller_cap(max_io_qsize, enable_timeout / 500);
800 	sx_init(&np->lock, "nvmft port");
801 	np->ids = new_unrhdr(0, MIN(CTL_MAX_INIT_PER_PORT - 1,
802 	    NVMF_CNTLID_STATIC_MAX), UNR_NO_MTX);
803 	TAILQ_INIT(&np->controllers);
804 
805 	/* The controller ID is set later for individual controllers. */
806 	_nvmf_init_io_controller_data(0, max_io_qsize, serial, ostype,
807 	    osrelease, subnqn, nn, ioccsz, iorcsz, &np->cdata);
808 	np->cdata.aerl = NVMFT_NUM_AER - 1;
809 	np->cdata.oaes = htole32(NVME_ASYNC_EVENT_NS_ATTRIBUTE);
810 	np->cdata.oncs = htole16(NVMEF(NVME_CTRLR_DATA_ONCS_VERIFY, 1) |
811 	    NVMEF(NVME_CTRLR_DATA_ONCS_WRZERO, 1) |
812 	    NVMEF(NVME_CTRLR_DATA_ONCS_DSM, 1) |
813 	    NVMEF(NVME_CTRLR_DATA_ONCS_COMPARE, 1));
814 	np->cdata.fuses = NVMEF(NVME_CTRLR_DATA_FUSES_CNW, 1);
815 
816 	np->fp.afi = NVMEF(NVME_FIRMWARE_PAGE_AFI_SLOT, 1);
817 	memcpy(np->fp.revision[0], np->cdata.fr, sizeof(np->cdata.fr));
818 
819 	port = &np->port;
820 
821 	port->frontend = &nvmft_frontend;
822 	port->port_type = CTL_PORT_NVMF;
823 	port->num_requested_ctl_io = max_io_qsize;
824 	port->port_name = "nvmf";
825 	port->physical_port = portid;
826 	port->virtual_port = 0;
827 	port->port_online = nvmft_online;
828 	port->port_offline = nvmft_offline;
829 	port->onoff_arg = np;
830 	port->lun_enable = nvmft_lun_enable;
831 	port->lun_disable = nvmft_lun_disable;
832 	port->targ_lun_arg = np;
833 	port->fe_datamove = nvmft_datamove;
834 	port->fe_done = nvmft_done;
835 	port->targ_port = -1;
836 	port->options = nvlist_clone(req->args_nvl);
837 
838 	error = ctl_port_register(port);
839 	if (error != 0) {
840 		sx_xunlock(&nvmft_ports_lock);
841 		nvlist_destroy(port->options);
842 		nvmft_port_rele(np);
843 		req->status = CTL_LUN_ERROR;
844 		snprintf(req->error_str, sizeof(req->error_str),
845 		    "Failed to register CTL port with error %d", error);
846 		return;
847 	}
848 
849 	TAILQ_INSERT_TAIL(&nvmft_ports, np, link);
850 	sx_xunlock(&nvmft_ports_lock);
851 
852 	req->status = CTL_LUN_OK;
853 	req->result_nvl = nvlist_create(0);
854 	nvlist_add_number(req->result_nvl, "port_id", port->targ_port);
855 }
856 
857 static void
858 nvmft_port_remove(struct ctl_req *req)
859 {
860 	struct nvmft_port *np;
861 	const char *subnqn;
862 	u_long port_id;
863 
864 	/*
865 	 * ctladm port -r just provides the port_id, so permit looking
866 	 * up a port either by "subnqn" or "port_id".
867 	 */
868 	port_id = ULONG_MAX;
869 	subnqn = dnvlist_get_string(req->args_nvl, "subnqn", NULL);
870 	if (subnqn == NULL) {
871 		if (!nvlist_exists_string(req->args_nvl, "port_id")) {
872 			req->status = CTL_LUN_ERROR;
873 			snprintf(req->error_str, sizeof(req->error_str),
874 			    "Missing required argument");
875 			return;
876 		}
877 		if (!dnvlist_get_strnum(req->args_nvl, "port_id", ULONG_MAX,
878 		    &port_id)) {
879 			req->status = CTL_LUN_ERROR;
880 			snprintf(req->error_str, sizeof(req->error_str),
881 			    "Invalid CTL port ID");
882 			return;
883 		}
884 	} else {
885 		if (nvlist_exists_string(req->args_nvl, "port_id")) {
886 			req->status = CTL_LUN_ERROR;
887 			snprintf(req->error_str, sizeof(req->error_str),
888 			    "Ambiguous port removal request");
889 			return;
890 		}
891 	}
892 
893 	sx_xlock(&nvmft_ports_lock);
894 
895 	if (subnqn != NULL) {
896 		np = nvmft_port_find(subnqn);
897 		if (np == NULL) {
898 			req->status = CTL_LUN_ERROR;
899 			snprintf(req->error_str, sizeof(req->error_str),
900 			    "SubNQN \"%s\" does not exist", subnqn);
901 			sx_xunlock(&nvmft_ports_lock);
902 			return;
903 		}
904 	} else {
905 		np = nvmft_port_find_by_id(port_id);
906 		if (np == NULL) {
907 			req->status = CTL_LUN_ERROR;
908 			snprintf(req->error_str, sizeof(req->error_str),
909 			    "CTL port %lu is not a NVMF port", port_id);
910 			sx_xunlock(&nvmft_ports_lock);
911 			return;
912 		}
913 	}
914 
915 	TAILQ_REMOVE(&nvmft_ports, np, link);
916 	sx_xunlock(&nvmft_ports_lock);
917 
918 	ctl_port_offline(&np->port);
919 	nvmft_port_rele(np);
920 	req->status = CTL_LUN_OK;
921 }
922 
923 static void
924 nvmft_handoff(struct ctl_nvmf *cn)
925 {
926 	const struct nvmf_fabric_connect_cmd *cmd;
927 	const struct nvmf_fabric_connect_data *data;
928 	const nvlist_t *params;
929 	struct nvmft_port *np;
930 	nvlist_t *nvl;
931 	size_t len;
932 	enum nvmf_trtype trtype;
933 	int error;
934 
935 	np = NULL;
936 	error = nvmf_unpack_ioc_nvlist(&cn->data.handoff, &nvl);
937 	if (error != 0) {
938 		cn->status = CTL_NVMF_ERROR;
939 		snprintf(cn->error_str, sizeof(cn->error_str),
940 		    "Failed to copyin and unpack handoff arguments");
941 		return;
942 	}
943 
944 	if (!nvlist_exists_number(nvl, "trtype") ||
945 	    !nvlist_exists_nvlist(nvl, "params") ||
946 	    !nvlist_exists_binary(nvl, "cmd") ||
947 	    !nvlist_exists_binary(nvl, "data")) {
948 		cn->status = CTL_NVMF_ERROR;
949 		snprintf(cn->error_str, sizeof(cn->error_str),
950 		    "Handoff arguments missing required value");
951 		goto out;
952 	}
953 
954 	params = nvlist_get_nvlist(nvl, "params");
955 	if (!nvmf_validate_qpair_nvlist(params, true)) {
956 		cn->status = CTL_NVMF_ERROR;
957 		snprintf(cn->error_str, sizeof(cn->error_str),
958 		    "Invalid queue pair parameters");
959 		goto out;
960 	}
961 
962 	cmd = nvlist_get_binary(nvl, "cmd", &len);
963 	if (len != sizeof(*cmd)) {
964 		cn->status = CTL_NVMF_ERROR;
965 		snprintf(cn->error_str, sizeof(cn->error_str),
966 		    "Wrong size for CONNECT SQE");
967 		goto out;
968 	}
969 
970 	data = nvlist_get_binary(nvl, "data", &len);
971 	if (len != sizeof(*data)) {
972 		cn->status = CTL_NVMF_ERROR;
973 		snprintf(cn->error_str, sizeof(cn->error_str),
974 		    "Wrong size for CONNECT data");
975 		goto out;
976 	}
977 
978 	if (!nvmf_nqn_valid(data->subnqn)) {
979 		cn->status = CTL_NVMF_ERROR;
980 		snprintf(cn->error_str, sizeof(cn->error_str),
981 		    "Invalid SubNQN");
982 		goto out;
983 	}
984 
985 	sx_slock(&nvmft_ports_lock);
986 	np = nvmft_port_find(data->subnqn);
987 	if (np == NULL) {
988 		sx_sunlock(&nvmft_ports_lock);
989 		cn->status = CTL_NVMF_ERROR;
990 		snprintf(cn->error_str, sizeof(cn->error_str),
991 		    "Unknown SubNQN");
992 		goto out;
993 	}
994 	if (!np->online) {
995 		sx_sunlock(&nvmft_ports_lock);
996 		cn->status = CTL_NVMF_ERROR;
997 		snprintf(cn->error_str, sizeof(cn->error_str),
998 		    "CTL port offline");
999 		np = NULL;
1000 		goto out;
1001 	}
1002 	nvmft_port_ref(np);
1003 	sx_sunlock(&nvmft_ports_lock);
1004 
1005 	trtype = nvlist_get_number(nvl, "trtype");
1006 	if (nvlist_get_bool(params, "admin")) {
1007 		error = nvmft_handoff_admin_queue(np, trtype, params, cmd,
1008 		    data);
1009 		if (error != 0) {
1010 			cn->status = CTL_NVMF_ERROR;
1011 			snprintf(cn->error_str, sizeof(cn->error_str),
1012 			    "Failed to handoff admin queue: %d", error);
1013 			goto out;
1014 		}
1015 	} else {
1016 		error = nvmft_handoff_io_queue(np, trtype, params, cmd, data);
1017 		if (error != 0) {
1018 			cn->status = CTL_NVMF_ERROR;
1019 			snprintf(cn->error_str, sizeof(cn->error_str),
1020 			    "Failed to handoff I/O queue: %d", error);
1021 			goto out;
1022 		}
1023 	}
1024 
1025 	cn->status = CTL_NVMF_OK;
1026 out:
1027 	if (np != NULL)
1028 		nvmft_port_rele(np);
1029 	nvlist_destroy(nvl);
1030 }
1031 
1032 static void
1033 nvmft_list(struct ctl_nvmf *cn)
1034 {
1035 	struct ctl_nvmf_list_params *lp;
1036 	struct nvmft_controller *ctrlr;
1037 	struct nvmft_port *np;
1038 	struct sbuf *sb;
1039 	int error;
1040 
1041 	lp = &cn->data.list;
1042 
1043 	sb = sbuf_new(NULL, NULL, lp->alloc_len, SBUF_FIXEDLEN |
1044 	    SBUF_INCLUDENUL);
1045 	if (sb == NULL) {
1046 		cn->status = CTL_NVMF_ERROR;
1047 		snprintf(cn->error_str, sizeof(cn->error_str),
1048 		    "Failed to allocate NVMeoF session list");
1049 		return;
1050 	}
1051 
1052 	sbuf_printf(sb, "<ctlnvmflist>\n");
1053 	sx_slock(&nvmft_ports_lock);
1054 	TAILQ_FOREACH(np, &nvmft_ports, link) {
1055 		sx_slock(&np->lock);
1056 		TAILQ_FOREACH(ctrlr, &np->controllers, link) {
1057 			sbuf_printf(sb, "<connection id=\"%d\">"
1058 			    "<hostnqn>%s</hostnqn>"
1059 			    "<subnqn>%s</subnqn>"
1060 			    "<trtype>%u</trtype>"
1061 			    "</connection>\n",
1062 			    ctrlr->cntlid,
1063 			    ctrlr->hostnqn,
1064 			    np->cdata.subnqn,
1065 			    ctrlr->trtype);
1066 		}
1067 		sx_sunlock(&np->lock);
1068 	}
1069 	sx_sunlock(&nvmft_ports_lock);
1070 	sbuf_printf(sb, "</ctlnvmflist>\n");
1071 	if (sbuf_finish(sb) != 0) {
1072 		sbuf_delete(sb);
1073 		cn->status = CTL_NVMF_LIST_NEED_MORE_SPACE;
1074 		snprintf(cn->error_str, sizeof(cn->error_str),
1075 		    "Out of space, %d bytes is too small", lp->alloc_len);
1076 		return;
1077 	}
1078 
1079 	error = copyout(sbuf_data(sb), lp->conn_xml, sbuf_len(sb));
1080 	if (error != 0) {
1081 		sbuf_delete(sb);
1082 		cn->status = CTL_NVMF_ERROR;
1083 		snprintf(cn->error_str, sizeof(cn->error_str),
1084 		    "Failed to copyout session list: %d", error);
1085 		return;
1086 	}
1087 	lp->fill_len = sbuf_len(sb);
1088 	cn->status = CTL_NVMF_OK;
1089 	sbuf_delete(sb);
1090 }
1091 
1092 static void
1093 nvmft_terminate(struct ctl_nvmf *cn)
1094 {
1095 	struct ctl_nvmf_terminate_params *tp;
1096 	struct nvmft_controller *ctrlr;
1097 	struct nvmft_port *np;
1098 	bool found, match;
1099 
1100 	tp = &cn->data.terminate;
1101 
1102 	found = false;
1103 	sx_slock(&nvmft_ports_lock);
1104 	TAILQ_FOREACH(np, &nvmft_ports, link) {
1105 		sx_slock(&np->lock);
1106 		TAILQ_FOREACH(ctrlr, &np->controllers, link) {
1107 			if (tp->all != 0)
1108 				match = true;
1109 			else if (tp->cntlid != -1)
1110 				match = tp->cntlid == ctrlr->cntlid;
1111 			else if (tp->hostnqn[0] != '\0')
1112 				match = strncmp(tp->hostnqn, ctrlr->hostnqn,
1113 				    sizeof(tp->hostnqn)) == 0;
1114 			else
1115 				match = false;
1116 			if (!match)
1117 				continue;
1118 			nvmft_printf(ctrlr,
1119 			    "disconnecting due to administrative request\n");
1120 			nvmft_controller_error(ctrlr, NULL, ECONNABORTED);
1121 			found = true;
1122 		}
1123 		sx_sunlock(&np->lock);
1124 	}
1125 	sx_sunlock(&nvmft_ports_lock);
1126 
1127 	if (!found) {
1128 		cn->status = CTL_NVMF_ASSOCIATION_NOT_FOUND;
1129 		snprintf(cn->error_str, sizeof(cn->error_str),
1130 		    "No matching associations found");
1131 		return;
1132 	}
1133 	cn->status = CTL_NVMF_OK;
1134 }
1135 
1136 static int
1137 nvmft_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int flag,
1138     struct thread *td)
1139 {
1140 	struct ctl_nvmf *cn;
1141 	struct ctl_req *req;
1142 
1143 	switch (cmd) {
1144 	case CTL_PORT_REQ:
1145 		req = (struct ctl_req *)data;
1146 		switch (req->reqtype) {
1147 		case CTL_REQ_CREATE:
1148 			nvmft_port_create(req);
1149 			break;
1150 		case CTL_REQ_REMOVE:
1151 			nvmft_port_remove(req);
1152 			break;
1153 		default:
1154 			req->status = CTL_LUN_ERROR;
1155 			snprintf(req->error_str, sizeof(req->error_str),
1156 			    "Unsupported request type %d", req->reqtype);
1157 			break;
1158 		}
1159 		return (0);
1160 	case CTL_NVMF:
1161 		cn = (struct ctl_nvmf *)data;
1162 		switch (cn->type) {
1163 		case CTL_NVMF_HANDOFF:
1164 			nvmft_handoff(cn);
1165 			break;
1166 		case CTL_NVMF_LIST:
1167 			nvmft_list(cn);
1168 			break;
1169 		case CTL_NVMF_TERMINATE:
1170 			nvmft_terminate(cn);
1171 			break;
1172 		default:
1173 			cn->status = CTL_NVMF_ERROR;
1174 			snprintf(cn->error_str, sizeof(cn->error_str),
1175 			    "Invalid NVMeoF request type %d", cn->type);
1176 			break;
1177 		}
1178 		return (0);
1179 	default:
1180 		return (ENOTTY);
1181 	}
1182 }
1183 
1184 static int
1185 nvmft_shutdown(void)
1186 {
1187 	/* TODO: Need to check for active controllers. */
1188 	if (!TAILQ_EMPTY(&nvmft_ports))
1189 		return (EBUSY);
1190 
1191 	taskqueue_free(nvmft_taskq);
1192 	sx_destroy(&nvmft_ports_lock);
1193 	return (0);
1194 }
1195 
/* Declare the frontend to CTL and record the nvmf_transport dependency. */
CTL_FRONTEND_DECLARE(nvmft, nvmft_frontend);
MODULE_DEPEND(nvmft, nvmf_transport, 1, 1, 1);
1198