/* xref: /freebsd/sys/dev/nvmf/controller/nvmft_controller.c (revision a8089ea5aee578e08acab2438e82fc9a9ae50ed8) */
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2023-2024 Chelsio Communications, Inc.
 * Written by: John Baldwin <jhb@FreeBSD.org>
 */

#include <sys/param.h>
#include <sys/callout.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/memdesc.h>
#include <sys/mutex.h>
#include <sys/sbuf.h>
#include <sys/sx.h>
#include <sys/taskqueue.h>

#include <dev/nvmf/nvmf_transport.h>
#include <dev/nvmf/controller/nvmft_subr.h>
#include <dev/nvmf/controller/nvmft_var.h>

static void	nvmft_controller_shutdown(void *arg, int pending);
static void	nvmft_controller_terminate(void *arg, int pending);

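/*
 * printf(9)-style helper that prefixes each message with the
 * controller name, e.g. nvmft_printf(ctrlr, "reset requested\n")
 * logs "nvmft<cntlid>: reset requested".  Returns the number of
 * characters drained to the console.
 */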
int
nvmft_printf(struct nvmft_controller *ctrlr, const char *fmt, ...)
{
	char buf[128];
	struct sbuf sb;
	va_list ap;
	size_t retval;

	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
	sbuf_set_drain(&sb, sbuf_printf_drain, &retval);

	sbuf_printf(&sb, "nvmft%u: ", ctrlr->cntlid);

	va_start(ap, fmt);
	sbuf_vprintf(&sb, fmt, ap);
	va_end(ap);

	sbuf_finish(&sb);
	sbuf_delete(&sb);

	return (retval);
}

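/*
 * Allocate a new controller for an association and link it into the
 * port's list.  The caller holds the port lock exclusively and has
 * already reserved a unique controller ID (cntlid).
 */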
static struct nvmft_controller *
nvmft_controller_alloc(struct nvmft_port *np, uint16_t cntlid,
    const struct nvmf_fabric_connect_data *data)
{
	struct nvmft_controller *ctrlr;

	ctrlr = malloc(sizeof(*ctrlr), M_NVMFT, M_WAITOK | M_ZERO);
	ctrlr->cntlid = cntlid;
	nvmft_port_ref(np);
	TAILQ_INSERT_TAIL(&np->controllers, ctrlr, link);
	ctrlr->np = np;
	mtx_init(&ctrlr->lock, "nvmft controller", NULL, MTX_DEF);
	callout_init(&ctrlr->ka_timer, 1);
	TASK_INIT(&ctrlr->shutdown_task, 0, nvmft_controller_shutdown, ctrlr);
	TIMEOUT_TASK_INIT(taskqueue_thread, &ctrlr->terminate_task, 0,
	    nvmft_controller_terminate, ctrlr);

	ctrlr->cdata = np->cdata;
	ctrlr->cdata.ctrlr_id = htole16(cntlid);
	memcpy(ctrlr->hostid, data->hostid, sizeof(ctrlr->hostid));
	memcpy(ctrlr->hostnqn, data->hostnqn, sizeof(ctrlr->hostnqn));
	ctrlr->hip.power_cycles[0] = 1;
	ctrlr->create_time = sbinuptime();

	ctrlr->changed_ns = malloc(sizeof(*ctrlr->changed_ns), M_NVMFT,
	    M_WAITOK | M_ZERO);

	return (ctrlr);
}

static void
nvmft_controller_free(struct nvmft_controller *ctrlr)
{
	mtx_destroy(&ctrlr->lock);
	MPASS(ctrlr->io_qpairs == NULL);
	free(ctrlr->changed_ns, M_NVMFT);
	free(ctrlr, M_NVMFT);
}

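/*
 * Keep Alive timer callback: if no capsule has been received since
 * the previous expiration, treat the association as dead; otherwise
 * re-arm the timer for another KATO interval.
 */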
static void
nvmft_keep_alive_timer(void *arg)
{
	struct nvmft_controller *ctrlr = arg;
	int traffic;

	if (ctrlr->shutdown)
		return;

	traffic = atomic_readandclear_int(&ctrlr->ka_active_traffic);
	if (traffic == 0) {
		nvmft_printf(ctrlr,
		    "disconnecting due to KeepAlive timeout\n");
		nvmft_controller_error(ctrlr, NULL, ETIMEDOUT);
		return;
	}

	callout_schedule_sbt(&ctrlr->ka_timer, ctrlr->ka_sbt, 0, C_HARDCLOCK);
}

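/*
 * Create a new association in response to a CONNECT command for an
 * admin queue (QID 0): allocate a controller ID and controller
 * structure, start the Keep Alive timer, and send the CONNECT
 * response.
 */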
int
nvmft_handoff_admin_queue(struct nvmft_port *np,
    const struct nvmf_handoff_controller_qpair *handoff,
    const struct nvmf_fabric_connect_cmd *cmd,
    const struct nvmf_fabric_connect_data *data)
{
	struct nvmft_controller *ctrlr;
	struct nvmft_qpair *qp;
	uint32_t kato;
	int cntlid;

	if (cmd->qid != htole16(0))
		return (EINVAL);

	qp = nvmft_qpair_init(handoff->trtype, &handoff->params, 0,
	    "admin queue");

	sx_xlock(&np->lock);
	cntlid = alloc_unr(np->ids);
	if (cntlid == -1) {
		sx_xunlock(&np->lock);
		printf("NVMFT: Unable to allocate controller for %.*s\n",
		    (int)sizeof(data->hostnqn), data->hostnqn);
		nvmft_connect_error(qp, cmd, NVME_SCT_COMMAND_SPECIFIC,
		    NVMF_FABRIC_SC_INVALID_HOST);
		nvmft_qpair_destroy(qp);
		return (ENOMEM);
	}

#ifdef INVARIANTS
	TAILQ_FOREACH(ctrlr, &np->controllers, link) {
		KASSERT(ctrlr->cntlid != cntlid,
		    ("%s: duplicate controllers with id %d", __func__, cntlid));
	}
#endif

	ctrlr = nvmft_controller_alloc(np, cntlid, data);
	nvmft_printf(ctrlr, "associated with %.*s\n",
	    (int)sizeof(data->hostnqn), data->hostnqn);
	ctrlr->admin = qp;
	ctrlr->trtype = handoff->trtype;

	/*
	 * The spec requires a non-zero KeepAlive timer, but allow a
	 * zero KATO value to match Linux.
	 */
	kato = le32toh(cmd->kato);
	if (kato != 0) {
		/*
		 * Round up to 1 second matching granularity
		 * advertised in cdata.
		 */
		ctrlr->ka_sbt = mstosbt(roundup(kato, 1000));
		callout_reset_sbt(&ctrlr->ka_timer, ctrlr->ka_sbt, 0,
		    nvmft_keep_alive_timer, ctrlr, C_HARDCLOCK);
	}

	nvmft_finish_accept(qp, cmd, ctrlr);
	sx_xunlock(&np->lock);

	return (0);
}

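/*
 * Attach an I/O queue to an existing association in response to a
 * CONNECT command with a non-zero QID.  The CONNECT data must match
 * the hostid and hostnqn of the original admin connection, and the
 * host must have negotiated the number of queues via SET_FEATURES
 * first.
 */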
int
nvmft_handoff_io_queue(struct nvmft_port *np,
    const struct nvmf_handoff_controller_qpair *handoff,
    const struct nvmf_fabric_connect_cmd *cmd,
    const struct nvmf_fabric_connect_data *data)
{
	struct nvmft_controller *ctrlr;
	struct nvmft_qpair *qp;
	char name[16];
	uint16_t cntlid, qid;

	qid = le16toh(cmd->qid);
	if (qid == 0)
		return (EINVAL);
	cntlid = le16toh(data->cntlid);

	snprintf(name, sizeof(name), "I/O queue %u", qid);
	qp = nvmft_qpair_init(handoff->trtype, &handoff->params, qid, name);

	sx_slock(&np->lock);
	TAILQ_FOREACH(ctrlr, &np->controllers, link) {
		if (ctrlr->cntlid == cntlid)
			break;
	}
	if (ctrlr == NULL) {
		sx_sunlock(&np->lock);
		printf("NVMFT: Nonexistent controller %u for I/O queue %u from %.*s\n",
		    cntlid, qid, (int)sizeof(data->hostnqn),
		    data->hostnqn);
		nvmft_connect_invalid_parameters(qp, cmd, true,
		    offsetof(struct nvmf_fabric_connect_data, cntlid));
		nvmft_qpair_destroy(qp);
		return (ENOENT);
	}

	if (memcmp(ctrlr->hostid, data->hostid, sizeof(ctrlr->hostid)) != 0) {
		sx_sunlock(&np->lock);
		nvmft_printf(ctrlr,
		    "hostid mismatch for I/O queue %u from %.*s\n", qid,
		    (int)sizeof(data->hostnqn), data->hostnqn);
		nvmft_connect_invalid_parameters(qp, cmd, true,
		    offsetof(struct nvmf_fabric_connect_data, hostid));
		nvmft_qpair_destroy(qp);
		return (EINVAL);
	}
	if (memcmp(ctrlr->hostnqn, data->hostnqn, sizeof(ctrlr->hostnqn)) != 0) {
		sx_sunlock(&np->lock);
		nvmft_printf(ctrlr,
		    "hostnqn mismatch for I/O queue %u from %.*s\n", qid,
		    (int)sizeof(data->hostnqn), data->hostnqn);
		nvmft_connect_invalid_parameters(qp, cmd, true,
		    offsetof(struct nvmf_fabric_connect_data, hostnqn));
		nvmft_qpair_destroy(qp);
		return (EINVAL);
	}

	/* XXX: Require handoff->trtype == ctrlr->trtype? */

	mtx_lock(&ctrlr->lock);
	if (ctrlr->shutdown) {
		mtx_unlock(&ctrlr->lock);
		sx_sunlock(&np->lock);
		nvmft_printf(ctrlr,
		    "attempt to create I/O queue %u on disabled controller from %.*s\n",
		    qid, (int)sizeof(data->hostnqn), data->hostnqn);
		nvmft_connect_invalid_parameters(qp, cmd, true,
		    offsetof(struct nvmf_fabric_connect_data, cntlid));
		nvmft_qpair_destroy(qp);
		return (EINVAL);
	}
	if (ctrlr->num_io_queues == 0) {
		mtx_unlock(&ctrlr->lock);
		sx_sunlock(&np->lock);
		nvmft_printf(ctrlr,
		    "attempt to create I/O queue %u without enabled queues from %.*s\n",
		    qid, (int)sizeof(data->hostnqn), data->hostnqn);
		nvmft_connect_error(qp, cmd, NVME_SCT_GENERIC,
		    NVME_SC_COMMAND_SEQUENCE_ERROR);
		nvmft_qpair_destroy(qp);
		return (EINVAL);
	}
	if (qid > ctrlr->num_io_queues) {
		mtx_unlock(&ctrlr->lock);
		sx_sunlock(&np->lock);
		nvmft_printf(ctrlr,
		    "attempt to create invalid I/O queue %u from %.*s\n", qid,
		    (int)sizeof(data->hostnqn), data->hostnqn);
		nvmft_connect_invalid_parameters(qp, cmd, false,
		    offsetof(struct nvmf_fabric_connect_cmd, qid));
		nvmft_qpair_destroy(qp);
		return (EINVAL);
	}
	if (ctrlr->io_qpairs[qid - 1].qp != NULL) {
		mtx_unlock(&ctrlr->lock);
		sx_sunlock(&np->lock);
		nvmft_printf(ctrlr,
		    "attempt to re-create I/O queue %u from %.*s\n", qid,
		    (int)sizeof(data->hostnqn), data->hostnqn);
		nvmft_connect_error(qp, cmd, NVME_SCT_GENERIC,
		    NVME_SC_COMMAND_SEQUENCE_ERROR);
		nvmft_qpair_destroy(qp);
		return (EINVAL);
	}

	ctrlr->io_qpairs[qid - 1].qp = qp;
	mtx_unlock(&ctrlr->lock);
	nvmft_finish_accept(qp, cmd, ctrlr);
	sx_sunlock(&np->lock);

	return (0);
}

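/*
 * Controller shutdown or reset requested via CC.SHN or clearing
 * CC.EN.  Runs from a taskqueue: quiesces and destroys the I/O
 * queues, waits for outstanding commands to drain, and then either
 * updates CSTS to reflect completion or arranges for the
 * association to be terminated.
 */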
static void
nvmft_controller_shutdown(void *arg, int pending)
{
	struct nvmft_controller *ctrlr = arg;

	MPASS(pending == 1);

	/*
	 * Shut down all I/O queues to terminate pending datamoves
	 * and stop receiving new commands.
	 */
	mtx_lock(&ctrlr->lock);
	for (u_int i = 0; i < ctrlr->num_io_queues; i++) {
		if (ctrlr->io_qpairs[i].qp != NULL) {
			ctrlr->io_qpairs[i].shutdown = true;
			mtx_unlock(&ctrlr->lock);
			nvmft_qpair_shutdown(ctrlr->io_qpairs[i].qp);
			mtx_lock(&ctrlr->lock);
		}
	}
	mtx_unlock(&ctrlr->lock);

	/* Terminate active CTL commands. */
	nvmft_terminate_commands(ctrlr);

	/* Wait for all pending CTL commands to complete. */
	mtx_lock(&ctrlr->lock);
	while (ctrlr->pending_commands != 0)
		mtx_sleep(&ctrlr->pending_commands, &ctrlr->lock, 0, "nvmftsh",
		    hz / 100);
	mtx_unlock(&ctrlr->lock);

	/* Delete all of the I/O queues. */
	for (u_int i = 0; i < ctrlr->num_io_queues; i++) {
		if (ctrlr->io_qpairs[i].qp != NULL)
			nvmft_qpair_destroy(ctrlr->io_qpairs[i].qp);
	}
	free(ctrlr->io_qpairs, M_NVMFT);
	ctrlr->io_qpairs = NULL;

	mtx_lock(&ctrlr->lock);
	ctrlr->num_io_queues = 0;

	/* Mark shutdown complete. */
	if (NVMEV(NVME_CSTS_REG_SHST, ctrlr->csts) == NVME_SHST_OCCURRING) {
		ctrlr->csts &= ~NVMEM(NVME_CSTS_REG_SHST);
		ctrlr->csts |= NVMEF(NVME_CSTS_REG_SHST, NVME_SHST_COMPLETE);
	}

	if (NVMEV(NVME_CSTS_REG_CFS, ctrlr->csts) == 0) {
		ctrlr->csts &= ~NVMEM(NVME_CSTS_REG_RDY);
		ctrlr->shutdown = false;
	}
	mtx_unlock(&ctrlr->lock);

	/*
	 * If the admin queue was closed while shutting down or a
	 * fatal controller error has occurred, terminate the
	 * association immediately, otherwise wait up to 2 minutes
	 * (NVMe-over-Fabrics 1.1 4.6).
	 */
	if (ctrlr->admin_closed || NVMEV(NVME_CSTS_REG_CFS, ctrlr->csts) != 0)
		nvmft_controller_terminate(ctrlr, 0);
	else
		taskqueue_enqueue_timeout(taskqueue_thread,
		    &ctrlr->terminate_task, hz * 60 * 2);
}

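/*
 * Tear down the association: destroy the admin queue pair, release
 * the controller ID, and free the controller.  Deferred to a
 * timeout task so that the host has a window after a shutdown to
 * re-enable the controller before the association is discarded.
 */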
static void
nvmft_controller_terminate(void *arg, int pending)
{
	struct nvmft_controller *ctrlr = arg;
	struct nvmft_port *np;
	bool wakeup_np;

	/* If the controller has been re-enabled, nothing to do. */
	mtx_lock(&ctrlr->lock);
	if (NVMEV(NVME_CC_REG_EN, ctrlr->cc) != 0) {
		mtx_unlock(&ctrlr->lock);

		if (ctrlr->ka_sbt != 0)
			callout_schedule_sbt(&ctrlr->ka_timer, ctrlr->ka_sbt, 0,
			    C_HARDCLOCK);
		return;
	}

	/* Disable updates to CC while destroying admin qpair. */
	ctrlr->shutdown = true;
	mtx_unlock(&ctrlr->lock);

	nvmft_qpair_destroy(ctrlr->admin);

	/* Remove association (CNTLID). */
	np = ctrlr->np;
	sx_xlock(&np->lock);
	TAILQ_REMOVE(&np->controllers, ctrlr, link);
	free_unr(np->ids, ctrlr->cntlid);
	wakeup_np = (!np->online && TAILQ_EMPTY(&np->controllers));
	sx_xunlock(&np->lock);
	if (wakeup_np)
		wakeup(np);

	callout_drain(&ctrlr->ka_timer);

	nvmft_printf(ctrlr, "association terminated\n");
	nvmft_controller_free(ctrlr);
	nvmft_port_rele(np);
}

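/*
 * Handle an error or queue pair closure reported by the transport
 * layer.  A zero error indicates an orderly queue closure; a
 * non-zero error sets CSTS.CFS and forces a controller shutdown.
 */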
void
nvmft_controller_error(struct nvmft_controller *ctrlr, struct nvmft_qpair *qp,
    int error)
{
	/*
	 * If a queue pair is closed, that isn't an error per se.
	 * That just means additional commands cannot be received on
	 * that queue pair.
	 *
	 * If the admin queue pair is closed while idle or while
	 * shutting down, terminate the association immediately.
	 *
	 * If an I/O queue pair is closed, just ignore it.
	 */
	if (error == 0) {
		if (qp != ctrlr->admin)
			return;

		mtx_lock(&ctrlr->lock);
		if (ctrlr->shutdown) {
			ctrlr->admin_closed = true;
			mtx_unlock(&ctrlr->lock);
			return;
		}

		if (NVMEV(NVME_CC_REG_EN, ctrlr->cc) == 0) {
			MPASS(ctrlr->num_io_queues == 0);
			mtx_unlock(&ctrlr->lock);

			/*
			 * Ok to drop lock here since ctrlr->cc can't
			 * change if the admin queue pair has closed.
			 * This also means no new queues can be handed
			 * off, etc.  Note that since there are no I/O
			 * queues, only the admin queue needs to be
			 * destroyed, so it is safe to skip
			 * nvmft_controller_shutdown and just schedule
			 * nvmft_controller_terminate.  Note that we
			 * cannot call nvmft_controller_terminate from
			 * here directly as this is called from the
			 * transport layer and freeing the admin qpair
			 * might deadlock waiting for the current
			 * thread to exit.
			 */
			if (taskqueue_cancel_timeout(taskqueue_thread,
			    &ctrlr->terminate_task, NULL) == 0)
				taskqueue_enqueue_timeout(taskqueue_thread,
				    &ctrlr->terminate_task, 0);
			return;
		}

		/*
		 * Treat closing of the admin queue pair while enabled
		 * as a transport error.  Note that the admin queue
		 * pair has been closed.
		 */
		ctrlr->admin_closed = true;
	} else
		mtx_lock(&ctrlr->lock);

	/* Ignore transport errors if we are already shutting down. */
	if (ctrlr->shutdown) {
		mtx_unlock(&ctrlr->lock);
		return;
	}

	ctrlr->csts |= NVMEF(NVME_CSTS_REG_CFS, 1);
	ctrlr->cc &= ~NVMEM(NVME_CC_REG_EN);
	ctrlr->shutdown = true;
	mtx_unlock(&ctrlr->lock);

	callout_stop(&ctrlr->ka_timer);
	taskqueue_enqueue(taskqueue_thread, &ctrlr->shutdown_task);
}

/* Wrapper around m_getm2 that also sets m_len in the mbufs in the chain. */
static struct mbuf *
m_getml(size_t len, int how)
{
	struct mbuf *m, *n;

	m = m_getm2(NULL, len, how, MT_DATA, 0);
	if (m == NULL)
		return (NULL);
	for (n = m; len > 0; n = n->m_next) {
		n->m_len = M_SIZE(n);
		if (n->m_len >= len) {
			n->m_len = len;
			MPASS(n->m_next == NULL);
		}
		len -= n->m_len;
	}
	return (m);
}

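/* Zero 'len' bytes of an mbuf chain starting at 'offset'. */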
static void
m_zero(struct mbuf *m, u_int offset, u_int len)
{
	u_int todo;

	if (len == 0)
		return;

	while (m->m_len <= offset) {
		offset -= m->m_len;
		m = m->m_next;
	}

	todo = m->m_len - offset;
	if (todo > len)
		todo = len;
	memset(mtodo(m, offset), 0, todo);
	m = m->m_next;
	len -= todo;

	while (len > 0) {
		todo = m->m_len;
		if (todo > len)
			todo = len;
		memset(mtod(m, void *), 0, todo);
		m = m->m_next;
		len -= todo;
	}
}

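/*
 * GET_LOG_PAGE.  The fields decode from the SQE as:
 *
 *	lid    = CDW10[7:0]			log page ID
 *	rae    = CDW10[15]			retain async event
 *	numd   = CDW11[15:0]:CDW10[31:16]	dword count, 0's based
 *	offset = CDW13:CDW12			byte offset into the page
 *
 * Responses shorter than the requested length are zero-padded.
 */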
static void
handle_get_log_page(struct nvmft_controller *ctrlr,
    struct nvmf_capsule *nc, const struct nvme_command *cmd)
{
	struct mbuf *m;
	uint64_t offset;
	uint32_t numd;
	size_t len, todo;
	u_int status;
	uint8_t lid;
	bool rae;

	lid = le32toh(cmd->cdw10) & 0xff;
	rae = (le32toh(cmd->cdw10) & (1U << 15)) != 0;
	numd = le32toh(cmd->cdw10) >> 16 | le32toh(cmd->cdw11) << 16;
	offset = le32toh(cmd->cdw12) | (uint64_t)le32toh(cmd->cdw13) << 32;

	/* The log page offset must be dword-aligned. */
	if (offset % 4 != 0) {
		status = NVME_SC_INVALID_FIELD;
		goto done;
	}

	len = (numd + 1) * 4;

	switch (lid) {
	case NVME_LOG_ERROR:
		todo = 0;

		m = m_getml(len, M_WAITOK);
		if (todo != len)
			m_zero(m, todo, len - todo);
		status = nvmf_send_controller_data(nc, 0, m, len);
		MPASS(status != NVMF_MORE);
		break;
	case NVME_LOG_HEALTH_INFORMATION:
	{
		struct nvme_health_information_page hip;

		if (offset >= sizeof(hip)) {
			status = NVME_SC_INVALID_FIELD;
			goto done;
		}
		todo = sizeof(hip) - offset;
		if (todo > len)
			todo = len;

		mtx_lock(&ctrlr->lock);
		hip = ctrlr->hip;
		hip.controller_busy_time[0] =
		    sbintime_getsec(ctrlr->busy_total) / 60;
		hip.power_on_hours[0] =
		    sbintime_getsec(sbinuptime() - ctrlr->create_time) / 3600;
		mtx_unlock(&ctrlr->lock);

		m = m_getml(len, M_WAITOK);
		m_copyback(m, 0, todo, (char *)&hip + offset);
		if (todo != len)
			m_zero(m, todo, len - todo);
		status = nvmf_send_controller_data(nc, 0, m, len);
		MPASS(status != NVMF_MORE);
		break;
	}
	case NVME_LOG_FIRMWARE_SLOT:
		if (offset >= sizeof(ctrlr->np->fp)) {
			status = NVME_SC_INVALID_FIELD;
			goto done;
		}
		todo = sizeof(ctrlr->np->fp) - offset;
		if (todo > len)
			todo = len;

		m = m_getml(len, M_WAITOK);
		m_copyback(m, 0, todo, (char *)&ctrlr->np->fp + offset);
		if (todo != len)
			m_zero(m, todo, len - todo);
		status = nvmf_send_controller_data(nc, 0, m, len);
		MPASS(status != NVMF_MORE);
		break;
	case NVME_LOG_CHANGED_NAMESPACE:
		if (offset >= sizeof(*ctrlr->changed_ns)) {
			status = NVME_SC_INVALID_FIELD;
			goto done;
		}
		todo = sizeof(*ctrlr->changed_ns) - offset;
		if (todo > len)
			todo = len;

		m = m_getml(len, M_WAITOK);
		mtx_lock(&ctrlr->lock);
		m_copyback(m, 0, todo, (char *)ctrlr->changed_ns + offset);
		if (offset == 0 && len == sizeof(*ctrlr->changed_ns))
			memset(ctrlr->changed_ns, 0,
			    sizeof(*ctrlr->changed_ns));
		if (!rae)
			ctrlr->changed_ns_reported = false;
		mtx_unlock(&ctrlr->lock);
		if (todo != len)
			m_zero(m, todo, len - todo);
		status = nvmf_send_controller_data(nc, 0, m, len);
		MPASS(status != NVMF_MORE);
		break;
	default:
		nvmft_printf(ctrlr, "Unsupported page %#x for GET_LOG_PAGE\n",
		    lid);
		status = NVME_SC_INVALID_FIELD;
		break;
	}

done:
	if (status == NVMF_SUCCESS_SENT)
		nvmft_command_completed(ctrlr->admin, nc);
	else
		nvmft_send_generic_error(ctrlr->admin, nc, status);
	nvmf_free_capsule(nc);
}

static void
m_free_nslist(struct mbuf *m)
{
	free(m->m_ext.ext_arg1, M_NVMFT);
}

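/*
 * IDENTIFY.  Namespace-related CNS values are forwarded to CTL;
 * the controller data and active namespace list are synthesized
 * here.
 */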
static void
handle_identify_command(struct nvmft_controller *ctrlr,
    struct nvmf_capsule *nc, const struct nvme_command *cmd)
{
	struct mbuf *m;
	size_t data_len;
	u_int status;
	uint8_t cns;

	cns = le32toh(cmd->cdw10) & 0xFF;
	data_len = nvmf_capsule_data_len(nc);
	if (data_len != sizeof(ctrlr->cdata)) {
		nvmft_printf(ctrlr,
		    "Invalid length %zu for IDENTIFY with CNS %#x\n", data_len,
		    cns);
		nvmft_send_generic_error(ctrlr->admin, nc,
		    NVME_SC_INVALID_OPCODE);
		nvmf_free_capsule(nc);
		return;
	}

	switch (cns) {
	case 0:	/* Namespace data. */
	case 3:	/* Namespace Identification Descriptor list. */
		nvmft_dispatch_command(ctrlr->admin, nc, true);
		return;
	case 1:
		/* Controller data. */
		m = m_getml(sizeof(ctrlr->cdata), M_WAITOK);
		m_copyback(m, 0, sizeof(ctrlr->cdata), (void *)&ctrlr->cdata);
		status = nvmf_send_controller_data(nc, 0, m,
		    sizeof(ctrlr->cdata));
		MPASS(status != NVMF_MORE);
		break;
	case 2:
	{
		/* Active namespace list. */
		struct nvme_ns_list *nslist;
		uint32_t nsid;

		nsid = le32toh(cmd->nsid);
		if (nsid >= 0xfffffffe) {
			status = NVME_SC_INVALID_FIELD;
			break;
		}

		nslist = malloc(sizeof(*nslist), M_NVMFT, M_WAITOK | M_ZERO);
		nvmft_populate_active_nslist(ctrlr->np, nsid, nslist);
		m = m_get(M_WAITOK, MT_DATA);
		m_extadd(m, (void *)nslist, sizeof(*nslist), m_free_nslist,
		    nslist, NULL, 0, EXT_CTL);
		m->m_len = sizeof(*nslist);
		status = nvmf_send_controller_data(nc, 0, m, m->m_len);
		MPASS(status != NVMF_MORE);
		break;
	}
	default:
		nvmft_printf(ctrlr, "Unsupported CNS %#x for IDENTIFY\n", cns);
		status = NVME_SC_INVALID_FIELD;
		break;
	}

	if (status == NVMF_SUCCESS_SENT)
		nvmft_command_completed(ctrlr->admin, nc);
	else
		nvmft_send_generic_error(ctrlr->admin, nc, status);
	nvmf_free_capsule(nc);
}

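/*
 * SET_FEATURES.  Only Number of Queues and Asynchronous Event
 * Configuration are supported; the former may be set only once per
 * association.
 */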
static void
handle_set_features(struct nvmft_controller *ctrlr,
    struct nvmf_capsule *nc, const struct nvme_command *cmd)
{
	struct nvme_completion cqe;
	uint8_t fid;

	fid = NVMEV(NVME_FEAT_SET_FID, le32toh(cmd->cdw10));
	switch (fid) {
	case NVME_FEAT_NUMBER_OF_QUEUES:
	{
		uint32_t num_queues;
		struct nvmft_io_qpair *io_qpairs;

		num_queues = le32toh(cmd->cdw11) & 0xffff;

		/* 5.12.1.7: 65535 is invalid. */
		if (num_queues == 65535)
			goto error;

		/* Fabrics requires the same number of SQs and CQs. */
		if (le32toh(cmd->cdw11) >> 16 != num_queues)
			goto error;

		/* Convert to 1's based */
		num_queues++;

		io_qpairs = mallocarray(num_queues, sizeof(*io_qpairs),
		    M_NVMFT, M_WAITOK | M_ZERO);

		mtx_lock(&ctrlr->lock);
		if (ctrlr->num_io_queues != 0) {
			mtx_unlock(&ctrlr->lock);
			free(io_qpairs, M_NVMFT);
			nvmft_send_generic_error(ctrlr->admin, nc,
			    NVME_SC_COMMAND_SEQUENCE_ERROR);
			nvmf_free_capsule(nc);
			return;
		}

		ctrlr->num_io_queues = num_queues;
		ctrlr->io_qpairs = io_qpairs;
		mtx_unlock(&ctrlr->lock);

		nvmft_init_cqe(&cqe, nc, 0);
		cqe.cdw0 = cmd->cdw11;
		nvmft_send_response(ctrlr->admin, &cqe);
		nvmf_free_capsule(nc);
		return;
	}
	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
	{
		uint32_t aer_mask;

		aer_mask = le32toh(cmd->cdw11);

		/* Check for any reserved or unimplemented feature bits. */
		if ((aer_mask & 0xffffc000) != 0)
			goto error;

		mtx_lock(&ctrlr->lock);
		ctrlr->aer_mask = aer_mask;
		mtx_unlock(&ctrlr->lock);
		nvmft_send_success(ctrlr->admin, nc);
		return;
	}
	default:
		nvmft_printf(ctrlr,
		    "Unsupported feature ID %u for SET_FEATURES\n", fid);
		goto error;
	}

error:
	nvmft_send_generic_error(ctrlr->admin, nc, NVME_SC_INVALID_FIELD);
	nvmf_free_capsule(nc);
}

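/*
 * Apply a host write to the CC property.  Returns false if the
 * write is invalid, and sets *need_shutdown if the write requested
 * a shutdown (CC.SHN) or a reset (clearing CC.EN) so that the
 * caller can schedule the shutdown task after sending its response.
 */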
static bool
update_cc(struct nvmft_controller *ctrlr, uint32_t new_cc, bool *need_shutdown)
{
	struct nvmft_port *np = ctrlr->np;
	uint32_t changes;

	*need_shutdown = false;

	mtx_lock(&ctrlr->lock);

	/* Don't allow any changes while shutting down. */
	if (ctrlr->shutdown) {
		mtx_unlock(&ctrlr->lock);
		return (false);
	}

	if (!_nvmf_validate_cc(np->max_io_qsize, np->cap, ctrlr->cc, new_cc)) {
		mtx_unlock(&ctrlr->lock);
		return (false);
	}

	changes = ctrlr->cc ^ new_cc;
	ctrlr->cc = new_cc;

	/* Handle shutdown requests. */
	if (NVMEV(NVME_CC_REG_SHN, changes) != 0 &&
	    NVMEV(NVME_CC_REG_SHN, new_cc) != 0) {
		ctrlr->csts &= ~NVMEM(NVME_CSTS_REG_SHST);
		ctrlr->csts |= NVMEF(NVME_CSTS_REG_SHST, NVME_SHST_OCCURRING);
		ctrlr->cc &= ~NVMEM(NVME_CC_REG_EN);
		ctrlr->shutdown = true;
		*need_shutdown = true;
		nvmft_printf(ctrlr, "shutdown requested\n");
	}

	if (NVMEV(NVME_CC_REG_EN, changes) != 0) {
		if (NVMEV(NVME_CC_REG_EN, new_cc) == 0) {
			/* Controller reset. */
			nvmft_printf(ctrlr, "reset requested\n");
			ctrlr->shutdown = true;
			*need_shutdown = true;
		} else
			ctrlr->csts |= NVMEF(NVME_CSTS_REG_RDY, 1);
	}
	mtx_unlock(&ctrlr->lock);

	return (true);
}

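/*
 * PROPERTY_GET.  Only the mandatory properties (CAP, VS, CC, and
 * CSTS) are implemented.
 */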
static void
handle_property_get(struct nvmft_controller *ctrlr, struct nvmf_capsule *nc,
    const struct nvmf_fabric_prop_get_cmd *pget)
{
	struct nvmf_fabric_prop_get_rsp rsp;

	nvmft_init_cqe(&rsp, nc, 0);

	switch (le32toh(pget->ofst)) {
	case NVMF_PROP_CAP:
		if (pget->attrib.size != NVMF_PROP_SIZE_8)
			goto error;
		rsp.value.u64 = htole64(ctrlr->np->cap);
		break;
	case NVMF_PROP_VS:
		if (pget->attrib.size != NVMF_PROP_SIZE_4)
			goto error;
		rsp.value.u32.low = ctrlr->cdata.ver;
		break;
	case NVMF_PROP_CC:
		if (pget->attrib.size != NVMF_PROP_SIZE_4)
			goto error;
		rsp.value.u32.low = htole32(ctrlr->cc);
		break;
	case NVMF_PROP_CSTS:
		if (pget->attrib.size != NVMF_PROP_SIZE_4)
			goto error;
		rsp.value.u32.low = htole32(ctrlr->csts);
		break;
	default:
		goto error;
	}

	nvmft_send_response(ctrlr->admin, &rsp);
	return;
error:
	nvmft_send_generic_error(ctrlr->admin, nc, NVME_SC_INVALID_FIELD);
}

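/* PROPERTY_SET.  CC is the only writable property supported. */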
static void
handle_property_set(struct nvmft_controller *ctrlr, struct nvmf_capsule *nc,
    const struct nvmf_fabric_prop_set_cmd *pset)
{
	bool need_shutdown;

	need_shutdown = false;
	switch (le32toh(pset->ofst)) {
	case NVMF_PROP_CC:
		if (pset->attrib.size != NVMF_PROP_SIZE_4)
			goto error;
		if (!update_cc(ctrlr, le32toh(pset->value.u32.low),
		    &need_shutdown))
			goto error;
		break;
	default:
		goto error;
	}

	nvmft_send_success(ctrlr->admin, nc);
	if (need_shutdown) {
		callout_stop(&ctrlr->ka_timer);
		taskqueue_enqueue(taskqueue_thread, &ctrlr->shutdown_task);
	}
	return;
error:
	nvmft_send_generic_error(ctrlr->admin, nc, NVME_SC_INVALID_FIELD);
}

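/*
 * Dispatch a Fabrics command received on the admin queue.  CONNECT
 * is rejected since the queue is already connected, and DISCONNECT
 * is only valid on I/O queues.
 */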
static void
handle_admin_fabrics_command(struct nvmft_controller *ctrlr,
    struct nvmf_capsule *nc, const struct nvmf_fabric_cmd *fc)
{
	switch (fc->fctype) {
	case NVMF_FABRIC_COMMAND_PROPERTY_GET:
		handle_property_get(ctrlr, nc,
		    (const struct nvmf_fabric_prop_get_cmd *)fc);
		break;
	case NVMF_FABRIC_COMMAND_PROPERTY_SET:
		handle_property_set(ctrlr, nc,
		    (const struct nvmf_fabric_prop_set_cmd *)fc);
		break;
	case NVMF_FABRIC_COMMAND_CONNECT:
		nvmft_printf(ctrlr,
		    "CONNECT command on connected admin queue\n");
		nvmft_send_generic_error(ctrlr->admin, nc,
		    NVME_SC_COMMAND_SEQUENCE_ERROR);
		break;
	case NVMF_FABRIC_COMMAND_DISCONNECT:
		nvmft_printf(ctrlr, "DISCONNECT command on admin queue\n");
		nvmft_send_error(ctrlr->admin, nc, NVME_SCT_COMMAND_SPECIFIC,
		    NVMF_FABRIC_SC_INVALID_QUEUE_TYPE);
		break;
	default:
		nvmft_printf(ctrlr, "Unsupported fabrics command %#x\n",
		    fc->fctype);
		nvmft_send_generic_error(ctrlr->admin, nc,
		    NVME_SC_INVALID_OPCODE);
		break;
	}
	nvmf_free_capsule(nc);
}

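/*
 * Entry point for admin commands from the transport.  Any received
 * command counts as traffic for Keep Alive purposes.  Commands that
 * touch namespace state are dispatched to CTL; the rest are handled
 * inline.
 */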
void
nvmft_handle_admin_command(struct nvmft_controller *ctrlr,
    struct nvmf_capsule *nc)
{
	const struct nvme_command *cmd = nvmf_capsule_sqe(nc);

	/* Only permit Fabrics commands while a controller is disabled. */
	if (NVMEV(NVME_CC_REG_EN, ctrlr->cc) == 0 &&
	    cmd->opc != NVME_OPC_FABRICS_COMMANDS) {
		nvmft_printf(ctrlr,
		    "Unsupported admin opcode %#x while disabled\n", cmd->opc);
		nvmft_send_generic_error(ctrlr->admin, nc,
		    NVME_SC_COMMAND_SEQUENCE_ERROR);
		nvmf_free_capsule(nc);
		return;
	}

	atomic_store_int(&ctrlr->ka_active_traffic, 1);

	switch (cmd->opc) {
	case NVME_OPC_GET_LOG_PAGE:
		handle_get_log_page(ctrlr, nc, cmd);
		break;
	case NVME_OPC_IDENTIFY:
		handle_identify_command(ctrlr, nc, cmd);
		break;
	case NVME_OPC_SET_FEATURES:
		handle_set_features(ctrlr, nc, cmd);
		break;
	case NVME_OPC_ASYNC_EVENT_REQUEST:
		mtx_lock(&ctrlr->lock);
		if (ctrlr->aer_pending == NVMFT_NUM_AER) {
			mtx_unlock(&ctrlr->lock);
			nvmft_send_error(ctrlr->admin, nc,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
		} else {
			/* NB: Store the CID without byte-swapping. */
			ctrlr->aer_cids[ctrlr->aer_pidx] = cmd->cid;
			ctrlr->aer_pending++;
			ctrlr->aer_pidx = (ctrlr->aer_pidx + 1) % NVMFT_NUM_AER;
			mtx_unlock(&ctrlr->lock);
		}
		nvmf_free_capsule(nc);
		break;
	case NVME_OPC_KEEP_ALIVE:
		nvmft_send_success(ctrlr->admin, nc);
		nvmf_free_capsule(nc);
		break;
	case NVME_OPC_FABRICS_COMMANDS:
		handle_admin_fabrics_command(ctrlr, nc,
		    (const struct nvmf_fabric_cmd *)cmd);
		break;
	default:
		nvmft_printf(ctrlr, "Unsupported admin opcode %#x\n", cmd->opc);
		nvmft_send_generic_error(ctrlr->admin, nc,
		    NVME_SC_INVALID_OPCODE);
		nvmf_free_capsule(nc);
		break;
	}
}

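/*
 * Entry point for I/O commands from the transport.  Supported
 * opcodes are passed through to CTL; a FLUSH of all namespaces
 * (NSID 0xffffffff) is rejected up front.
 */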
void
nvmft_handle_io_command(struct nvmft_qpair *qp, uint16_t qid,
    struct nvmf_capsule *nc)
{
	struct nvmft_controller *ctrlr = nvmft_qpair_ctrlr(qp);
	const struct nvme_command *cmd = nvmf_capsule_sqe(nc);

	atomic_store_int(&ctrlr->ka_active_traffic, 1);

	switch (cmd->opc) {
	case NVME_OPC_FLUSH:
		if (cmd->nsid == htole32(0xffffffff)) {
			nvmft_send_generic_error(qp, nc,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			nvmf_free_capsule(nc);
			break;
		}
		/* FALLTHROUGH */
	case NVME_OPC_WRITE:
	case NVME_OPC_READ:
	case NVME_OPC_WRITE_UNCORRECTABLE:
	case NVME_OPC_COMPARE:
	case NVME_OPC_WRITE_ZEROES:
	case NVME_OPC_DATASET_MANAGEMENT:
	case NVME_OPC_VERIFY:
		nvmft_dispatch_command(qp, nc, false);
		break;
	default:
		nvmft_printf(ctrlr, "Unsupported I/O opcode %#x\n", cmd->opc);
		nvmft_send_generic_error(qp, nc,
		    NVME_SC_INVALID_OPCODE);
		nvmf_free_capsule(nc);
		break;
	}
}

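/*
 * Complete a pending ASYNC_EVENT_REQUEST, if any, to report an
 * asynchronous event of the given type.  Events masked off by the
 * host's Asynchronous Event Configuration are dropped.
 */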
static void
nvmft_report_aer(struct nvmft_controller *ctrlr, uint32_t aer_mask,
    u_int type, uint8_t info, uint8_t log_page_id)
{
	struct nvme_completion cpl;

	MPASS(type <= 7);

	/* Drop events that are not enabled. */
	mtx_lock(&ctrlr->lock);
	if ((ctrlr->aer_mask & aer_mask) == 0) {
		mtx_unlock(&ctrlr->lock);
		return;
	}

	/*
	 * If there is no pending AER command, drop it.
	 * XXX: Should we queue these?
	 */
	if (ctrlr->aer_pending == 0) {
		mtx_unlock(&ctrlr->lock);
		nvmft_printf(ctrlr,
		    "dropping AER type %u, info %#x, page %#x\n",
		    type, info, log_page_id);
		return;
	}

	memset(&cpl, 0, sizeof(cpl));
	cpl.cid = ctrlr->aer_cids[ctrlr->aer_cidx];
	ctrlr->aer_pending--;
	ctrlr->aer_cidx = (ctrlr->aer_cidx + 1) % NVMFT_NUM_AER;
	mtx_unlock(&ctrlr->lock);

	cpl.cdw0 = htole32(NVMEF(NVME_ASYNC_EVENT_TYPE, type) |
	    NVMEF(NVME_ASYNC_EVENT_INFO, info) |
	    NVMEF(NVME_ASYNC_EVENT_LOG_PAGE_ID, log_page_id));

	nvmft_send_response(ctrlr->admin, &cpl);
}

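/*
 * Record a namespace change for the given LUN in the Changed
 * Namespace List log page (NSID = LUN ID + 1) and raise a Namespace
 * Attribute Changed AER the first time the list goes unreported.
 */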
void
nvmft_controller_lun_changed(struct nvmft_controller *ctrlr, int lun_id)
{
	struct nvme_ns_list *nslist;
	uint32_t new_nsid, nsid;
	u_int i;

	new_nsid = lun_id + 1;

	mtx_lock(&ctrlr->lock);
	nslist = ctrlr->changed_ns;

	/* If the first entry is 0xffffffff, the list is already full. */
	if (nslist->ns[0] != 0xffffffff) {
		/* Find the insertion point for this namespace ID. */
		for (i = 0; i < nitems(nslist->ns); i++) {
			nsid = le32toh(nslist->ns[i]);
			if (nsid == new_nsid) {
				/* Already reported, nothing to do. */
				mtx_unlock(&ctrlr->lock);
				return;
			}

			if (nsid == 0 || nsid > new_nsid)
				break;
		}

		if (nslist->ns[nitems(nslist->ns) - 1] != htole32(0)) {
			/* List is full. */
			memset(ctrlr->changed_ns, 0,
			    sizeof(*ctrlr->changed_ns));
			ctrlr->changed_ns->ns[0] = 0xffffffff;
		} else if (nslist->ns[i] == htole32(0)) {
			/*
			 * Optimize case where this ID is appended to
			 * the end.
			 */
			nslist->ns[i] = htole32(new_nsid);
		} else {
			memmove(&nslist->ns[i + 1], &nslist->ns[i],
			    (nitems(nslist->ns) - i - 1) *
			    sizeof(nslist->ns[0]));
			nslist->ns[i] = htole32(new_nsid);
		}
	}

	if (ctrlr->changed_ns_reported) {
		mtx_unlock(&ctrlr->lock);
		return;
	}
	ctrlr->changed_ns_reported = true;
	mtx_unlock(&ctrlr->lock);

	nvmft_report_aer(ctrlr, NVME_ASYNC_EVENT_NS_ATTRIBUTE, 0x2, 0x0,
	    NVME_LOG_CHANGED_NAMESPACE);
}