xref: /freebsd/sys/dev/nvmf/controller/nvmft_controller.c (revision bd66c1b43e33540205dbc1187c2f2a15c58b57ba)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2023-2024 Chelsio Communications, Inc.
5  * Written by: John Baldwin <jhb@FreeBSD.org>
6  */
7 
8 #include <sys/param.h>
9 #include <sys/callout.h>
10 #include <sys/kernel.h>
11 #include <sys/lock.h>
12 #include <sys/malloc.h>
13 #include <sys/mbuf.h>
14 #include <sys/memdesc.h>
15 #include <sys/mutex.h>
16 #include <sys/sbuf.h>
17 #include <sys/sx.h>
18 #include <sys/taskqueue.h>
19 
20 #include <dev/nvmf/nvmf_transport.h>
21 #include <dev/nvmf/controller/nvmft_subr.h>
22 #include <dev/nvmf/controller/nvmft_var.h>
23 
/*
 * Task handlers registered on each controller in
 * nvmft_controller_alloc; both are defined later in this file.
 */
static void	nvmft_controller_shutdown(void *arg, int pending);
static void	nvmft_controller_terminate(void *arg, int pending);
26 
27 int
28 nvmft_printf(struct nvmft_controller *ctrlr, const char *fmt, ...)
29 {
30 	char buf[128];
31 	struct sbuf sb;
32 	va_list ap;
33 	size_t retval;
34 
35 	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
36 	sbuf_set_drain(&sb, sbuf_printf_drain, &retval);
37 
38 	sbuf_printf(&sb, "nvmft%u: ", ctrlr->cntlid);
39 
40 	va_start(ap, fmt);
41 	sbuf_vprintf(&sb, fmt, ap);
42 	va_end(ap);
43 
44 	sbuf_finish(&sb);
45 	sbuf_delete(&sb);
46 
47 	return (retval);
48 }
49 
50 static struct nvmft_controller *
51 nvmft_controller_alloc(struct nvmft_port *np, uint16_t cntlid,
52     const struct nvmf_fabric_connect_data *data)
53 {
54 	struct nvmft_controller *ctrlr;
55 
56 	ctrlr = malloc(sizeof(*ctrlr), M_NVMFT, M_WAITOK | M_ZERO);
57 	ctrlr->cntlid = cntlid;
58 	nvmft_port_ref(np);
59 	TAILQ_INSERT_TAIL(&np->controllers, ctrlr, link);
60 	ctrlr->np = np;
61 	mtx_init(&ctrlr->lock, "nvmft controller", NULL, MTX_DEF);
62 	callout_init(&ctrlr->ka_timer, 1);
63 	TASK_INIT(&ctrlr->shutdown_task, 0, nvmft_controller_shutdown, ctrlr);
64 	TIMEOUT_TASK_INIT(taskqueue_thread, &ctrlr->terminate_task, 0,
65 	    nvmft_controller_terminate, ctrlr);
66 
67 	ctrlr->cdata = np->cdata;
68 	ctrlr->cdata.ctrlr_id = htole16(cntlid);
69 	memcpy(ctrlr->hostid, data->hostid, sizeof(ctrlr->hostid));
70 	memcpy(ctrlr->hostnqn, data->hostnqn, sizeof(ctrlr->hostnqn));
71 	ctrlr->hip.power_cycles[0] = 1;
72 	ctrlr->create_time = sbinuptime();
73 
74 	ctrlr->changed_ns = malloc(sizeof(*ctrlr->changed_ns), M_NVMFT,
75 	    M_WAITOK | M_ZERO);
76 
77 	return (ctrlr);
78 }
79 
80 static void
81 nvmft_controller_free(struct nvmft_controller *ctrlr)
82 {
83 	mtx_destroy(&ctrlr->lock);
84 	MPASS(ctrlr->io_qpairs == NULL);
85 	free(ctrlr->changed_ns, M_NVMFT);
86 	free(ctrlr, M_NVMFT);
87 }
88 
89 static void
90 nvmft_keep_alive_timer(void *arg)
91 {
92 	struct nvmft_controller *ctrlr = arg;
93 	int traffic;
94 
95 	if (ctrlr->shutdown)
96 		return;
97 
98 	traffic = atomic_readandclear_int(&ctrlr->ka_active_traffic);
99 	if (traffic == 0) {
100 		nvmft_printf(ctrlr,
101 		    "disconnecting due to KeepAlive timeout\n");
102 		nvmft_controller_error(ctrlr, NULL, ETIMEDOUT);
103 		return;
104 	}
105 
106 	callout_schedule_sbt(&ctrlr->ka_timer, ctrlr->ka_sbt, 0, C_HARDCLOCK);
107 }
108 
/*
 * Accept a handed-off admin queue and create a new controller
 * (association) for it on port 'np'.
 *
 * Returns 0 on success.  Returns EINVAL if the CONNECT command's
 * queue ID is not zero, ENXIO if the queue pair cannot be set up, and
 * ENOMEM if no controller ID can be allocated (in which case a
 * CONNECT error is sent and the queue pair is destroyed).
 */
int
nvmft_handoff_admin_queue(struct nvmft_port *np,
    const struct nvmf_handoff_controller_qpair *handoff,
    const struct nvmf_fabric_connect_cmd *cmd,
    const struct nvmf_fabric_connect_data *data)
{
	struct nvmft_controller *ctrlr;
	struct nvmft_qpair *qp;
	uint32_t kato;
	int cntlid;

	/* The admin queue is always queue 0. */
	if (cmd->qid != htole16(0))
		return (EINVAL);

	qp = nvmft_qpair_init(handoff->trtype, &handoff->params, 0,
	    "admin queue");
	if (qp == NULL) {
		printf("NVMFT: Failed to setup admin queue from %.*s\n",
		    (int)sizeof(data->hostnqn), data->hostnqn);
		return (ENXIO);
	}

	/*
	 * The port lock is held exclusively from ID allocation until
	 * the CONNECT has been accepted.
	 */
	sx_xlock(&np->lock);
	cntlid = alloc_unr(np->ids);
	if (cntlid == -1) {
		sx_xunlock(&np->lock);
		printf("NVMFT: Unable to allocate controller for %.*s\n",
		    (int)sizeof(data->hostnqn), data->hostnqn);
		nvmft_connect_error(qp, cmd, NVME_SCT_COMMAND_SPECIFIC,
		    NVMF_FABRIC_SC_INVALID_HOST);
		nvmft_qpair_destroy(qp);
		return (ENOMEM);
	}

#ifdef INVARIANTS
	/* Sanity check that the allocator never hands out a live ID. */
	TAILQ_FOREACH(ctrlr, &np->controllers, link) {
		KASSERT(ctrlr->cntlid != cntlid,
		    ("%s: duplicate controllers with id %d", __func__, cntlid));
	}
#endif

	ctrlr = nvmft_controller_alloc(np, cntlid, data);
	nvmft_printf(ctrlr, "associated with %.*s\n",
	    (int)sizeof(data->hostnqn), data->hostnqn);
	ctrlr->admin = qp;
	ctrlr->trtype = handoff->trtype;

	/*
	 * The spec requires a non-zero KeepAlive timer, but allow a
	 * zero KATO value to match Linux.
	 */
	kato = le32toh(cmd->kato);
	if (kato != 0) {
		/*
		 * Round up to 1 second matching granularity
		 * advertised in cdata.
		 */
		ctrlr->ka_sbt = mstosbt(roundup(kato, 1000));
		callout_reset_sbt(&ctrlr->ka_timer, ctrlr->ka_sbt, 0,
		    nvmft_keep_alive_timer, ctrlr, C_HARDCLOCK);
	}

	nvmft_finish_accept(qp, cmd, ctrlr);
	sx_xunlock(&np->lock);

	return (0);
}
176 
177 int
178 nvmft_handoff_io_queue(struct nvmft_port *np,
179     const struct nvmf_handoff_controller_qpair *handoff,
180     const struct nvmf_fabric_connect_cmd *cmd,
181     const struct nvmf_fabric_connect_data *data)
182 {
183 	struct nvmft_controller *ctrlr;
184 	struct nvmft_qpair *qp;
185 	char name[16];
186 	uint16_t cntlid, qid;
187 
188 	qid = le16toh(cmd->qid);
189 	if (qid == 0)
190 		return (EINVAL);
191 	cntlid = le16toh(data->cntlid);
192 
193 	snprintf(name, sizeof(name), "I/O queue %u", qid);
194 	qp = nvmft_qpair_init(handoff->trtype, &handoff->params, qid, name);
195 	if (qp == NULL) {
196 		printf("NVMFT: Failed to setup I/O queue %u from %.*s\n", qid,
197 		    (int)sizeof(data->hostnqn), data->hostnqn);
198 		return (ENXIO);
199 	}
200 
201 	sx_slock(&np->lock);
202 	TAILQ_FOREACH(ctrlr, &np->controllers, link) {
203 		if (ctrlr->cntlid == cntlid)
204 			break;
205 	}
206 	if (ctrlr == NULL) {
207 		sx_sunlock(&np->lock);
208 		printf("NVMFT: Nonexistent controller %u for I/O queue %u from %.*s\n",
209 		    ctrlr->cntlid, qid, (int)sizeof(data->hostnqn),
210 		    data->hostnqn);
211 		nvmft_connect_invalid_parameters(qp, cmd, true,
212 		    offsetof(struct nvmf_fabric_connect_data, cntlid));
213 		nvmft_qpair_destroy(qp);
214 		return (ENOENT);
215 	}
216 
217 	if (memcmp(ctrlr->hostid, data->hostid, sizeof(ctrlr->hostid)) != 0) {
218 		sx_sunlock(&np->lock);
219 		nvmft_printf(ctrlr,
220 		    "hostid mismatch for I/O queue %u from %.*s\n", qid,
221 		    (int)sizeof(data->hostnqn), data->hostnqn);
222 		nvmft_connect_invalid_parameters(qp, cmd, true,
223 		    offsetof(struct nvmf_fabric_connect_data, hostid));
224 		nvmft_qpair_destroy(qp);
225 		return (EINVAL);
226 	}
227 	if (memcmp(ctrlr->hostnqn, data->hostnqn, sizeof(ctrlr->hostnqn)) != 0) {
228 		sx_sunlock(&np->lock);
229 		nvmft_printf(ctrlr,
230 		    "hostnqn mismatch for I/O queue %u from %.*s\n", qid,
231 		    (int)sizeof(data->hostnqn), data->hostnqn);
232 		nvmft_connect_invalid_parameters(qp, cmd, true,
233 		    offsetof(struct nvmf_fabric_connect_data, hostnqn));
234 		nvmft_qpair_destroy(qp);
235 		return (EINVAL);
236 	}
237 
238 	/* XXX: Require handoff->trtype == ctrlr->trtype? */
239 
240 	mtx_lock(&ctrlr->lock);
241 	if (ctrlr->shutdown) {
242 		mtx_unlock(&ctrlr->lock);
243 		sx_sunlock(&np->lock);
244 		nvmft_printf(ctrlr,
245 		    "attempt to create I/O queue %u on disabled controller from %.*s\n",
246 		    qid, (int)sizeof(data->hostnqn), data->hostnqn);
247 		nvmft_connect_invalid_parameters(qp, cmd, true,
248 		    offsetof(struct nvmf_fabric_connect_data, cntlid));
249 		nvmft_qpair_destroy(qp);
250 		return (EINVAL);
251 	}
252 	if (ctrlr->num_io_queues == 0) {
253 		mtx_unlock(&ctrlr->lock);
254 		sx_sunlock(&np->lock);
255 		nvmft_printf(ctrlr,
256 		    "attempt to create I/O queue %u without enabled queues from %.*s\n",
257 		    qid, (int)sizeof(data->hostnqn), data->hostnqn);
258 		nvmft_connect_error(qp, cmd, NVME_SCT_GENERIC,
259 		    NVME_SC_COMMAND_SEQUENCE_ERROR);
260 		nvmft_qpair_destroy(qp);
261 		return (EINVAL);
262 	}
263 	if (cmd->qid > ctrlr->num_io_queues) {
264 		mtx_unlock(&ctrlr->lock);
265 		sx_sunlock(&np->lock);
266 		nvmft_printf(ctrlr,
267 		    "attempt to create invalid I/O queue %u from %.*s\n", qid,
268 		    (int)sizeof(data->hostnqn), data->hostnqn);
269 		nvmft_connect_invalid_parameters(qp, cmd, false,
270 		    offsetof(struct nvmf_fabric_connect_cmd, qid));
271 		nvmft_qpair_destroy(qp);
272 		return (EINVAL);
273 	}
274 	if (ctrlr->io_qpairs[qid - 1].qp != NULL) {
275 		mtx_unlock(&ctrlr->lock);
276 		sx_sunlock(&np->lock);
277 		nvmft_printf(ctrlr,
278 		    "attempt to re-create I/O queue %u from %.*s\n", qid,
279 		    (int)sizeof(data->hostnqn), data->hostnqn);
280 		nvmft_connect_error(qp, cmd, NVME_SCT_GENERIC,
281 		    NVME_SC_COMMAND_SEQUENCE_ERROR);
282 		nvmft_qpair_destroy(qp);
283 		return (EINVAL);
284 	}
285 
286 	ctrlr->io_qpairs[qid - 1].qp = qp;
287 	mtx_unlock(&ctrlr->lock);
288 	nvmft_finish_accept(qp, cmd, ctrlr);
289 	sx_sunlock(&np->lock);
290 
291 	return (0);
292 }
293 
/*
 * Task handler (shutdown_task) that quiesces a controller: shuts down
 * and destroys every I/O queue, waits for pending commands to drain,
 * updates CSTS, and then either terminates the association
 * immediately or schedules termination after a timeout.
 */
static void
nvmft_controller_shutdown(void *arg, int pending)
{
	struct nvmft_controller *ctrlr = arg;

	MPASS(pending == 1);

	/*
	 * Shutdown all I/O queues to terminate pending datamoves and
	 * stop receiving new commands.
	 */
	mtx_lock(&ctrlr->lock);
	for (u_int i = 0; i < ctrlr->num_io_queues; i++) {
		if (ctrlr->io_qpairs[i].qp != NULL) {
			ctrlr->io_qpairs[i].shutdown = true;
			/* The lock is dropped around the qpair shutdown call. */
			mtx_unlock(&ctrlr->lock);
			nvmft_qpair_shutdown(ctrlr->io_qpairs[i].qp);
			mtx_lock(&ctrlr->lock);
		}
	}
	mtx_unlock(&ctrlr->lock);

	/* Terminate active CTL commands. */
	nvmft_terminate_commands(ctrlr);

	/* Wait for all pending CTL commands to complete. */
	mtx_lock(&ctrlr->lock);
	while (ctrlr->pending_commands != 0)
		mtx_sleep(&ctrlr->pending_commands, &ctrlr->lock, 0, "nvmftsh",
		    hz / 100);
	mtx_unlock(&ctrlr->lock);

	/* Delete all of the I/O queues. */
	for (u_int i = 0; i < ctrlr->num_io_queues; i++) {
		if (ctrlr->io_qpairs[i].qp != NULL)
			nvmft_qpair_destroy(ctrlr->io_qpairs[i].qp);
	}
	free(ctrlr->io_qpairs, M_NVMFT);
	ctrlr->io_qpairs = NULL;

	mtx_lock(&ctrlr->lock);
	ctrlr->num_io_queues = 0;

	/* Mark shutdown complete. */
	if (NVMEV(NVME_CSTS_REG_SHST, ctrlr->csts) == NVME_SHST_OCCURRING) {
		ctrlr->csts &= ~NVMEM(NVME_CSTS_REG_SHST);
		ctrlr->csts |= NVMEF(NVME_CSTS_REG_SHST, NVME_SHST_COMPLETE);
	}

	/*
	 * Without a fatal status, clear RDY and allow CC updates
	 * again; with CFS set the controller stays in shutdown.
	 */
	if (NVMEV(NVME_CSTS_REG_CFS, ctrlr->csts) == 0) {
		ctrlr->csts &= ~NVMEM(NVME_CSTS_REG_RDY);
		ctrlr->shutdown = false;
	}
	mtx_unlock(&ctrlr->lock);

	/*
	 * If the admin queue was closed while shutting down or a
	 * fatal controller error has occurred, terminate the
	 * association immediately, otherwise wait up to 2 minutes
	 * (NVMe-over-Fabrics 1.1 4.6).
	 */
	if (ctrlr->admin_closed || NVMEV(NVME_CSTS_REG_CFS, ctrlr->csts) != 0)
		nvmft_controller_terminate(ctrlr, 0);
	else
		taskqueue_enqueue_timeout(taskqueue_thread,
		    &ctrlr->terminate_task, hz * 60 * 2);
}
361 
/*
 * Tear down an association: destroy the admin queue pair, remove the
 * controller from its port, release the controller ID, and free the
 * controller.  Used as the terminate_task handler and also called
 * directly from nvmft_controller_shutdown.  If the controller has
 * been re-enabled in the meantime, only the KeepAlive timer is
 * re-armed.
 */
static void
nvmft_controller_terminate(void *arg, int pending)
{
	struct nvmft_controller *ctrlr = arg;
	struct nvmft_port *np;
	bool wakeup_np;

	/* If the controller has been re-enabled, nothing to do. */
	mtx_lock(&ctrlr->lock);
	if (NVMEV(NVME_CC_REG_EN, ctrlr->cc) != 0) {
		mtx_unlock(&ctrlr->lock);

		/* A zero ka_sbt means no KeepAlive timer was requested. */
		if (ctrlr->ka_sbt != 0)
			callout_schedule_sbt(&ctrlr->ka_timer, ctrlr->ka_sbt, 0,
			    C_HARDCLOCK);
		return;
	}

	/* Disable updates to CC while destroying admin qpair. */
	ctrlr->shutdown = true;
	mtx_unlock(&ctrlr->lock);

	nvmft_qpair_destroy(ctrlr->admin);

	/* Remove association (CNTLID). */
	np = ctrlr->np;
	sx_xlock(&np->lock);
	TAILQ_REMOVE(&np->controllers, ctrlr, link);
	free_unr(np->ids, ctrlr->cntlid);
	/* Wake sleepers on 'np' once an offline port has no controllers. */
	wakeup_np = (!np->online && TAILQ_EMPTY(&np->controllers));
	sx_xunlock(&np->lock);
	if (wakeup_np)
		wakeup(np);

	/* Make sure the KeepAlive callout is finished before freeing. */
	callout_drain(&ctrlr->ka_timer);

	nvmft_printf(ctrlr, "association terminated\n");
	nvmft_controller_free(ctrlr);
	/* Drop the port reference taken in nvmft_controller_alloc. */
	nvmft_port_rele(np);
}
402 
/*
 * Report a queue pair closure (error == 0) or a transport error
 * (error != 0) on a controller.  'qp' identifies the affected queue
 * pair; the KeepAlive timer passes NULL along with ETIMEDOUT.
 *
 * Transport errors, and admin queue closure while the controller is
 * enabled, set CSTS.CFS, clear CC.EN, and queue the shutdown task.
 */
void
nvmft_controller_error(struct nvmft_controller *ctrlr, struct nvmft_qpair *qp,
    int error)
{
	/*
	 * If a queue pair is closed, that isn't an error per se.
	 * That just means additional commands cannot be received on
	 * that queue pair.
	 *
	 * If the admin queue pair is closed while idle or while
	 * shutting down, terminate the association immediately.
	 *
	 * If an I/O queue pair is closed, just ignore it.
	 */
	if (error == 0) {
		if (qp != ctrlr->admin)
			return;

		mtx_lock(&ctrlr->lock);
		if (ctrlr->shutdown) {
			/* nvmft_controller_shutdown checks this flag when it finishes. */
			ctrlr->admin_closed = true;
			mtx_unlock(&ctrlr->lock);
			return;
		}

		if (NVMEV(NVME_CC_REG_EN, ctrlr->cc) == 0) {
			MPASS(ctrlr->num_io_queues == 0);
			mtx_unlock(&ctrlr->lock);

			/*
			 * Ok to drop lock here since ctrlr->cc can't
			 * change if the admin queue pair has closed.
			 * This also means no new queues can be handed
			 * off, etc.  Note that since there are no I/O
			 * queues, only the admin queue needs to be
			 * destroyed, so it is safe to skip
			 * nvmft_controller_shutdown and just schedule
			 * nvmft_controller_terminate.  Note that we
			 * cannot call nvmft_controller_terminate from
			 * here directly as this is called from the
			 * transport layer and freeing the admin qpair
			 * might deadlock waiting for the current
			 * thread to exit.
			 */
			if (taskqueue_cancel_timeout(taskqueue_thread,
			    &ctrlr->terminate_task, NULL) == 0)
				taskqueue_enqueue_timeout(taskqueue_thread,
				    &ctrlr->terminate_task, 0);
			return;
		}

		/*
		 * Treat closing of the admin queue pair while enabled
		 * as a transport error.  Note that the admin queue
		 * pair has been closed.
		 */
		ctrlr->admin_closed = true;
	} else
		mtx_lock(&ctrlr->lock);

	/* Both branches above reach this point with the lock held. */

	/* Ignore transport errors if we are already shutting down. */
	if (ctrlr->shutdown) {
		mtx_unlock(&ctrlr->lock);
		return;
	}

	/* Flag a fatal controller status and disable the controller. */
	ctrlr->csts |= NVMEF(NVME_CSTS_REG_CFS, 1);
	ctrlr->cc &= ~NVMEM(NVME_CC_REG_EN);
	ctrlr->shutdown = true;
	mtx_unlock(&ctrlr->lock);

	callout_stop(&ctrlr->ka_timer);
	taskqueue_enqueue(taskqueue_thread, &ctrlr->shutdown_task);
}
477 
478 /* Wrapper around m_getm2 that also sets m_len in the mbufs in the chain. */
479 static struct mbuf *
480 m_getml(size_t len, int how)
481 {
482 	struct mbuf *m, *n;
483 
484 	m = m_getm2(NULL, len, how, MT_DATA, 0);
485 	if (m == NULL)
486 		return (NULL);
487 	for (n = m; len > 0; n = n->m_next) {
488 		n->m_len = M_SIZE(n);
489 		if (n->m_len >= len) {
490 			n->m_len = len;
491 			MPASS(n->m_next == NULL);
492 		}
493 		len -= n->m_len;
494 	}
495 	return (m);
496 }
497 
498 static void
499 m_zero(struct mbuf *m, u_int offset, u_int len)
500 {
501 	u_int todo;
502 
503 	if (len == 0)
504 		return;
505 
506 	while (m->m_len <= offset) {
507 		offset -= m->m_len;
508 		m = m->m_next;
509 	}
510 
511 	todo = m->m_len - offset;
512 	if (todo > len)
513 		todo = len;
514 	memset(mtodo(m, offset), 0, todo);
515 	m = m->m_next;
516 	len -= todo;
517 
518 	while (len > 0) {
519 		todo = m->m_len;
520 		if (todo > len)
521 			todo = len;
522 		memset(mtod(m, void *), 0, todo);
523 		m = m->m_next;
524 		len -= todo;
525 	}
526 }
527 
528 static void
529 handle_get_log_page(struct nvmft_controller *ctrlr,
530     struct nvmf_capsule *nc, const struct nvme_command *cmd)
531 {
532 	struct mbuf *m;
533 	uint64_t offset;
534 	uint32_t numd;
535 	size_t len, todo;
536 	u_int status;
537 	uint8_t lid;
538 	bool rae;
539 
540 	lid = le32toh(cmd->cdw10) & 0xff;
541 	rae = (le32toh(cmd->cdw10) & (1U << 15)) != 0;
542 	numd = le32toh(cmd->cdw10) >> 16 | le32toh(cmd->cdw11) << 16;
543 	offset = le32toh(cmd->cdw12) | (uint64_t)le32toh(cmd->cdw13) << 32;
544 
545 	if (offset % 3 != 0) {
546 		status = NVME_SC_INVALID_FIELD;
547 		goto done;
548 	}
549 
550 	len = (numd + 1) * 4;
551 
552 	switch (lid) {
553 	case NVME_LOG_ERROR:
554 		todo = 0;
555 
556 		m = m_getml(len, M_WAITOK);
557 		if (todo != len)
558 			m_zero(m, todo, len - todo);
559 		status = nvmf_send_controller_data(nc, 0, m, len);
560 		MPASS(status != NVMF_MORE);
561 		break;
562 	case NVME_LOG_HEALTH_INFORMATION:
563 	{
564 		struct nvme_health_information_page hip;
565 
566 		if (offset >= sizeof(hip)) {
567 			status = NVME_SC_INVALID_FIELD;
568 			goto done;
569 		}
570 		todo = sizeof(hip) - offset;
571 		if (todo > len)
572 			todo = len;
573 
574 		mtx_lock(&ctrlr->lock);
575 		hip = ctrlr->hip;
576 		hip.controller_busy_time[0] =
577 		    sbintime_getsec(ctrlr->busy_total) / 60;
578 		hip.power_on_hours[0] =
579 		    sbintime_getsec(sbinuptime() - ctrlr->create_time) / 3600;
580 		mtx_unlock(&ctrlr->lock);
581 
582 		m = m_getml(len, M_WAITOK);
583 		m_copyback(m, 0, todo, (char *)&hip + offset);
584 		if (todo != len)
585 			m_zero(m, todo, len - todo);
586 		status = nvmf_send_controller_data(nc, 0, m, len);
587 		MPASS(status != NVMF_MORE);
588 		break;
589 	}
590 	case NVME_LOG_FIRMWARE_SLOT:
591 		if (offset >= sizeof(ctrlr->np->fp)) {
592 			status = NVME_SC_INVALID_FIELD;
593 			goto done;
594 		}
595 		todo = sizeof(ctrlr->np->fp) - offset;
596 		if (todo > len)
597 			todo = len;
598 
599 		m = m_getml(len, M_WAITOK);
600 		m_copyback(m, 0, todo, (char *)&ctrlr->np->fp + offset);
601 		if (todo != len)
602 			m_zero(m, todo, len - todo);
603 		status = nvmf_send_controller_data(nc, 0, m, len);
604 		MPASS(status != NVMF_MORE);
605 		break;
606 	case NVME_LOG_CHANGED_NAMESPACE:
607 		if (offset >= sizeof(*ctrlr->changed_ns)) {
608 			status = NVME_SC_INVALID_FIELD;
609 			goto done;
610 		}
611 		todo = sizeof(*ctrlr->changed_ns) - offset;
612 		if (todo > len)
613 			todo = len;
614 
615 		m = m_getml(len, M_WAITOK);
616 		mtx_lock(&ctrlr->lock);
617 		m_copyback(m, 0, todo, (char *)ctrlr->changed_ns + offset);
618 		if (offset == 0 && len == sizeof(*ctrlr->changed_ns))
619 			memset(ctrlr->changed_ns, 0,
620 			    sizeof(*ctrlr->changed_ns));
621 		if (!rae)
622 			ctrlr->changed_ns_reported = false;
623 		mtx_unlock(&ctrlr->lock);
624 		if (todo != len)
625 			m_zero(m, todo, len - todo);
626 		status = nvmf_send_controller_data(nc, 0, m, len);
627 		MPASS(status != NVMF_MORE);
628 		break;
629 	default:
630 		nvmft_printf(ctrlr, "Unsupported page %#x for GET_LOG_PAGE\n",
631 		    lid);
632 		status = NVME_SC_INVALID_FIELD;
633 		break;
634 	}
635 
636 done:
637 	if (status == NVMF_SUCCESS_SENT)
638 		nvmft_command_completed(ctrlr->admin, nc);
639 	else
640 		nvmft_send_generic_error(ctrlr->admin, nc, status);
641 	nvmf_free_capsule(nc);
642 }
643 
644 static void
645 m_free_nslist(struct mbuf *m)
646 {
647 	free(m->m_ext.ext_arg1, M_NVMFT);
648 }
649 
650 static void
651 handle_identify_command(struct nvmft_controller *ctrlr,
652     struct nvmf_capsule *nc, const struct nvme_command *cmd)
653 {
654 	struct mbuf *m;
655 	size_t data_len;
656 	u_int status;
657 	uint8_t cns;
658 
659 	cns = le32toh(cmd->cdw10) & 0xFF;
660 	data_len = nvmf_capsule_data_len(nc);
661 	if (data_len != sizeof(ctrlr->cdata)) {
662 		nvmft_printf(ctrlr,
663 		    "Invalid length %zu for IDENTIFY with CNS %#x\n", data_len,
664 		    cns);
665 		nvmft_send_generic_error(ctrlr->admin, nc,
666 		    NVME_SC_INVALID_OPCODE);
667 		nvmf_free_capsule(nc);
668 		return;
669 	}
670 
671 	switch (cns) {
672 	case 0:	/* Namespace data. */
673 	case 3:	/* Namespace Identification Descriptor list. */
674 		nvmft_dispatch_command(ctrlr->admin, nc, true);
675 		return;
676 	case 1:
677 		/* Controller data. */
678 		m = m_getml(sizeof(ctrlr->cdata), M_WAITOK);
679 		m_copyback(m, 0, sizeof(ctrlr->cdata), (void *)&ctrlr->cdata);
680 		status = nvmf_send_controller_data(nc, 0, m,
681 		    sizeof(ctrlr->cdata));
682 		MPASS(status != NVMF_MORE);
683 		break;
684 	case 2:
685 	{
686 		/* Active namespace list. */
687 		struct nvme_ns_list *nslist;
688 		uint32_t nsid;
689 
690 		nsid = le32toh(cmd->nsid);
691 		if (nsid >= 0xfffffffe) {
692 			status = NVME_SC_INVALID_FIELD;
693 			break;
694 		}
695 
696 		nslist = malloc(sizeof(*nslist), M_NVMFT, M_WAITOK | M_ZERO);
697 		nvmft_populate_active_nslist(ctrlr->np, nsid, nslist);
698 		m = m_get(M_WAITOK, MT_DATA);
699 		m_extadd(m, (void *)nslist, sizeof(*nslist), m_free_nslist,
700 		    nslist, NULL, 0, EXT_CTL);
701 		m->m_len = sizeof(*nslist);
702 		status = nvmf_send_controller_data(nc, 0, m, m->m_len);
703 		MPASS(status != NVMF_MORE);
704 		break;
705 	}
706 	default:
707 		nvmft_printf(ctrlr, "Unsupported CNS %#x for IDENTIFY\n", cns);
708 		status = NVME_SC_INVALID_FIELD;
709 		break;
710 	}
711 
712 	if (status == NVMF_SUCCESS_SENT)
713 		nvmft_command_completed(ctrlr->admin, nc);
714 	else
715 		nvmft_send_generic_error(ctrlr->admin, nc, status);
716 	nvmf_free_capsule(nc);
717 }
718 
719 static void
720 handle_set_features(struct nvmft_controller *ctrlr,
721     struct nvmf_capsule *nc, const struct nvme_command *cmd)
722 {
723 	struct nvme_completion cqe;
724 	uint8_t fid;
725 
726 	fid = NVMEV(NVME_FEAT_SET_FID, le32toh(cmd->cdw10));
727 	switch (fid) {
728 	case NVME_FEAT_NUMBER_OF_QUEUES:
729 	{
730 		uint32_t num_queues;
731 		struct nvmft_io_qpair *io_qpairs;
732 
733 		num_queues = le32toh(cmd->cdw11) & 0xffff;
734 
735 		/* 5.12.1.7: 65535 is invalid. */
736 		if (num_queues == 65535)
737 			goto error;
738 
739 		/* Fabrics requires the same number of SQs and CQs. */
740 		if (le32toh(cmd->cdw11) >> 16 != num_queues)
741 			goto error;
742 
743 		/* Convert to 1's based */
744 		num_queues++;
745 
746 		io_qpairs = mallocarray(num_queues, sizeof(*io_qpairs),
747 		    M_NVMFT, M_WAITOK | M_ZERO);
748 
749 		mtx_lock(&ctrlr->lock);
750 		if (ctrlr->num_io_queues != 0) {
751 			mtx_unlock(&ctrlr->lock);
752 			free(io_qpairs, M_NVMFT);
753 			nvmft_send_generic_error(ctrlr->admin, nc,
754 			    NVME_SC_COMMAND_SEQUENCE_ERROR);
755 			nvmf_free_capsule(nc);
756 			return;
757 		}
758 
759 		ctrlr->num_io_queues = num_queues;
760 		ctrlr->io_qpairs = io_qpairs;
761 		mtx_unlock(&ctrlr->lock);
762 
763 		nvmft_init_cqe(&cqe, nc, 0);
764 		cqe.cdw0 = cmd->cdw11;
765 		nvmft_send_response(ctrlr->admin, &cqe);
766 		nvmf_free_capsule(nc);
767 		return;
768 	}
769 	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
770 	{
771 		uint32_t aer_mask;
772 
773 		aer_mask = le32toh(cmd->cdw11);
774 
775 		/* Check for any reserved or unimplemented feature bits. */
776 		if ((aer_mask & 0xffffc000) != 0)
777 			goto error;
778 
779 		mtx_lock(&ctrlr->lock);
780 		ctrlr->aer_mask = aer_mask;
781 		mtx_unlock(&ctrlr->lock);
782 		nvmft_send_success(ctrlr->admin, nc);
783 		return;
784 	}
785 	default:
786 		nvmft_printf(ctrlr,
787 		    "Unsupported feature ID %u for SET_FEATURES\n", fid);
788 		goto error;
789 	}
790 
791 error:
792 	nvmft_send_generic_error(ctrlr->admin, nc, NVME_SC_INVALID_FIELD);
793 	nvmf_free_capsule(nc);
794 }
795 
/*
 * Apply a host write of 'new_cc' to the controller's CC property.
 *
 * Returns false, leaving CC unchanged, if the controller is shutting
 * down or _nvmf_validate_cc rejects the new value.  Returns true
 * otherwise.  '*need_shutdown' is set to true when the write requests
 * a shutdown (CC.SHN changed to non-zero) or a reset (CC.EN cleared),
 * in which case the caller has to queue the shutdown task.
 */
static bool
update_cc(struct nvmft_controller *ctrlr, uint32_t new_cc, bool *need_shutdown)
{
	struct nvmft_port *np = ctrlr->np;
	uint32_t changes;

	*need_shutdown = false;

	mtx_lock(&ctrlr->lock);

	/* Don't allow any changes while shutting down. */
	if (ctrlr->shutdown) {
		mtx_unlock(&ctrlr->lock);
		return (false);
	}

	if (!_nvmf_validate_cc(np->max_io_qsize, np->cap, ctrlr->cc, new_cc)) {
		mtx_unlock(&ctrlr->lock);
		return (false);
	}

	/* Bits that differ between the old and new CC values. */
	changes = ctrlr->cc ^ new_cc;
	ctrlr->cc = new_cc;

	/* Handle shutdown requests. */
	if (NVMEV(NVME_CC_REG_SHN, changes) != 0 &&
	    NVMEV(NVME_CC_REG_SHN, new_cc) != 0) {
		ctrlr->csts &= ~NVMEM(NVME_CSTS_REG_SHST);
		ctrlr->csts |= NVMEF(NVME_CSTS_REG_SHST, NVME_SHST_OCCURRING);
		ctrlr->cc &= ~NVMEM(NVME_CC_REG_EN);
		ctrlr->shutdown = true;
		*need_shutdown = true;
		nvmft_printf(ctrlr, "shutdown requested\n");
	}

	if (NVMEV(NVME_CC_REG_EN, changes) != 0) {
		if (NVMEV(NVME_CC_REG_EN, new_cc) == 0) {
			/* Controller reset. */
			nvmft_printf(ctrlr, "reset requested\n");
			ctrlr->shutdown = true;
			*need_shutdown = true;
		} else
			/* Controller enabled: report it as ready. */
			ctrlr->csts |= NVMEF(NVME_CSTS_REG_RDY, 1);
	}
	mtx_unlock(&ctrlr->lock);

	return (true);
}
844 
845 static void
846 handle_property_get(struct nvmft_controller *ctrlr, struct nvmf_capsule *nc,
847     const struct nvmf_fabric_prop_get_cmd *pget)
848 {
849 	struct nvmf_fabric_prop_get_rsp rsp;
850 
851 	nvmft_init_cqe(&rsp, nc, 0);
852 
853 	switch (le32toh(pget->ofst)) {
854 	case NVMF_PROP_CAP:
855 		if (pget->attrib.size != NVMF_PROP_SIZE_8)
856 			goto error;
857 		rsp.value.u64 = htole64(ctrlr->np->cap);
858 		break;
859 	case NVMF_PROP_VS:
860 		if (pget->attrib.size != NVMF_PROP_SIZE_4)
861 			goto error;
862 		rsp.value.u32.low = ctrlr->cdata.ver;
863 		break;
864 	case NVMF_PROP_CC:
865 		if (pget->attrib.size != NVMF_PROP_SIZE_4)
866 			goto error;
867 		rsp.value.u32.low = htole32(ctrlr->cc);
868 		break;
869 	case NVMF_PROP_CSTS:
870 		if (pget->attrib.size != NVMF_PROP_SIZE_4)
871 			goto error;
872 		rsp.value.u32.low = htole32(ctrlr->csts);
873 		break;
874 	default:
875 		goto error;
876 	}
877 
878 	nvmft_send_response(ctrlr->admin, &rsp);
879 	return;
880 error:
881 	nvmft_send_generic_error(ctrlr->admin, nc, NVME_SC_INVALID_FIELD);
882 }
883 
884 static void
885 handle_property_set(struct nvmft_controller *ctrlr, struct nvmf_capsule *nc,
886     const struct nvmf_fabric_prop_set_cmd *pset)
887 {
888 	bool need_shutdown;
889 
890 	need_shutdown = false;
891 	switch (le32toh(pset->ofst)) {
892 	case NVMF_PROP_CC:
893 		if (pset->attrib.size != NVMF_PROP_SIZE_4)
894 			goto error;
895 		if (!update_cc(ctrlr, le32toh(pset->value.u32.low),
896 		    &need_shutdown))
897 			goto error;
898 		break;
899 	default:
900 		goto error;
901 	}
902 
903 	nvmft_send_success(ctrlr->admin, nc);
904 	if (need_shutdown) {
905 		callout_stop(&ctrlr->ka_timer);
906 		taskqueue_enqueue(taskqueue_thread, &ctrlr->shutdown_task);
907 	}
908 	return;
909 error:
910 	nvmft_send_generic_error(ctrlr->admin, nc, NVME_SC_INVALID_FIELD);
911 }
912 
913 static void
914 handle_admin_fabrics_command(struct nvmft_controller *ctrlr,
915     struct nvmf_capsule *nc, const struct nvmf_fabric_cmd *fc)
916 {
917 	switch (fc->fctype) {
918 	case NVMF_FABRIC_COMMAND_PROPERTY_GET:
919 		handle_property_get(ctrlr, nc,
920 		    (const struct nvmf_fabric_prop_get_cmd *)fc);
921 		break;
922 	case NVMF_FABRIC_COMMAND_PROPERTY_SET:
923 		handle_property_set(ctrlr, nc,
924 		    (const struct nvmf_fabric_prop_set_cmd *)fc);
925 		break;
926 	case NVMF_FABRIC_COMMAND_CONNECT:
927 		nvmft_printf(ctrlr,
928 		    "CONNECT command on connected admin queue\n");
929 		nvmft_send_generic_error(ctrlr->admin, nc,
930 		    NVME_SC_COMMAND_SEQUENCE_ERROR);
931 		break;
932 	case NVMF_FABRIC_COMMAND_DISCONNECT:
933 		nvmft_printf(ctrlr, "DISCONNECT command on admin queue\n");
934 		nvmft_send_error(ctrlr->admin, nc, NVME_SCT_COMMAND_SPECIFIC,
935 		    NVMF_FABRIC_SC_INVALID_QUEUE_TYPE);
936 		break;
937 	default:
938 		nvmft_printf(ctrlr, "Unsupported fabrics command %#x\n",
939 		    fc->fctype);
940 		nvmft_send_generic_error(ctrlr->admin, nc,
941 		    NVME_SC_INVALID_OPCODE);
942 		break;
943 	}
944 	nvmf_free_capsule(nc);
945 }
946 
947 void
948 nvmft_handle_admin_command(struct nvmft_controller *ctrlr,
949     struct nvmf_capsule *nc)
950 {
951 	const struct nvme_command *cmd = nvmf_capsule_sqe(nc);
952 
953 	/* Only permit Fabrics commands while a controller is disabled. */
954 	if (NVMEV(NVME_CC_REG_EN, ctrlr->cc) == 0 &&
955 	    cmd->opc != NVME_OPC_FABRICS_COMMANDS) {
956 		nvmft_printf(ctrlr,
957 		    "Unsupported admin opcode %#x while disabled\n", cmd->opc);
958 		nvmft_send_generic_error(ctrlr->admin, nc,
959 		    NVME_SC_COMMAND_SEQUENCE_ERROR);
960 		nvmf_free_capsule(nc);
961 		return;
962 	}
963 
964 	atomic_store_int(&ctrlr->ka_active_traffic, 1);
965 
966 	switch (cmd->opc) {
967 	case NVME_OPC_GET_LOG_PAGE:
968 		handle_get_log_page(ctrlr, nc, cmd);
969 		break;
970 	case NVME_OPC_IDENTIFY:
971 		handle_identify_command(ctrlr, nc, cmd);
972 		break;
973 	case NVME_OPC_SET_FEATURES:
974 		handle_set_features(ctrlr, nc, cmd);
975 		break;
976 	case NVME_OPC_ASYNC_EVENT_REQUEST:
977 		mtx_lock(&ctrlr->lock);
978 		if (ctrlr->aer_pending == NVMFT_NUM_AER) {
979 			mtx_unlock(&ctrlr->lock);
980 			nvmft_send_error(ctrlr->admin, nc,
981 			    NVME_SCT_COMMAND_SPECIFIC,
982 			    NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
983 		} else {
984 			/* NB: Store the CID without byte-swapping. */
985 			ctrlr->aer_cids[ctrlr->aer_pidx] = cmd->cid;
986 			ctrlr->aer_pending++;
987 			ctrlr->aer_pidx = (ctrlr->aer_pidx + 1) % NVMFT_NUM_AER;
988 			mtx_unlock(&ctrlr->lock);
989 		}
990 		nvmf_free_capsule(nc);
991 		break;
992 	case NVME_OPC_KEEP_ALIVE:
993 		nvmft_send_success(ctrlr->admin, nc);
994 		nvmf_free_capsule(nc);
995 		break;
996 	case NVME_OPC_FABRICS_COMMANDS:
997 		handle_admin_fabrics_command(ctrlr, nc,
998 		    (const struct nvmf_fabric_cmd *)cmd);
999 		break;
1000 	default:
1001 		nvmft_printf(ctrlr, "Unsupported admin opcode %#x\n", cmd->opc);
1002 		nvmft_send_generic_error(ctrlr->admin, nc,
1003 		    NVME_SC_INVALID_OPCODE);
1004 		nvmf_free_capsule(nc);
1005 		break;
1006 	}
1007 }
1008 
1009 void
1010 nvmft_handle_io_command(struct nvmft_qpair *qp, uint16_t qid,
1011     struct nvmf_capsule *nc)
1012 {
1013 	struct nvmft_controller *ctrlr = nvmft_qpair_ctrlr(qp);
1014 	const struct nvme_command *cmd = nvmf_capsule_sqe(nc);
1015 
1016 	atomic_store_int(&ctrlr->ka_active_traffic, 1);
1017 
1018 	switch (cmd->opc) {
1019 	case NVME_OPC_FLUSH:
1020 		if (cmd->nsid == htole32(0xffffffff)) {
1021 			nvmft_send_generic_error(qp, nc,
1022 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1023 			nvmf_free_capsule(nc);
1024 			break;
1025 		}
1026 		/* FALLTHROUGH */
1027 	case NVME_OPC_WRITE:
1028 	case NVME_OPC_READ:
1029 	case NVME_OPC_WRITE_UNCORRECTABLE:
1030 	case NVME_OPC_COMPARE:
1031 	case NVME_OPC_WRITE_ZEROES:
1032 	case NVME_OPC_DATASET_MANAGEMENT:
1033 	case NVME_OPC_VERIFY:
1034 		nvmft_dispatch_command(qp, nc, false);
1035 		break;
1036 	default:
1037 		nvmft_printf(ctrlr, "Unsupported I/O opcode %#x\n", cmd->opc);
1038 		nvmft_send_generic_error(qp, nc,
1039 		    NVME_SC_INVALID_OPCODE);
1040 		nvmf_free_capsule(nc);
1041 		break;
1042 	}
1043 }
1044 
1045 static void
1046 nvmft_report_aer(struct nvmft_controller *ctrlr, uint32_t aer_mask,
1047     u_int type, uint8_t info, uint8_t log_page_id)
1048 {
1049 	struct nvme_completion cpl;
1050 
1051 	MPASS(type <= 7);
1052 
1053 	/* Drop events that are not enabled. */
1054 	mtx_lock(&ctrlr->lock);
1055 	if ((ctrlr->aer_mask & aer_mask) == 0) {
1056 		mtx_unlock(&ctrlr->lock);
1057 		return;
1058 	}
1059 
1060 	/*
1061 	 * If there is no pending AER command, drop it.
1062 	 * XXX: Should we queue these?
1063 	 */
1064 	if (ctrlr->aer_pending == 0) {
1065 		mtx_unlock(&ctrlr->lock);
1066 		nvmft_printf(ctrlr,
1067 		    "dropping AER type %u, info %#x, page %#x\n",
1068 		    type, info, log_page_id);
1069 		return;
1070 	}
1071 
1072 	memset(&cpl, 0, sizeof(cpl));
1073 	cpl.cid = ctrlr->aer_cids[ctrlr->aer_cidx];
1074 	ctrlr->aer_pending--;
1075 	ctrlr->aer_cidx = (ctrlr->aer_cidx + 1) % NVMFT_NUM_AER;
1076 	mtx_unlock(&ctrlr->lock);
1077 
1078 	cpl.cdw0 = htole32(NVMEF(NVME_ASYNC_EVENT_TYPE, type) |
1079 	    NVMEF(NVME_ASYNC_EVENT_INFO, info) |
1080 	    NVMEF(NVME_ASYNC_EVENT_LOG_PAGE_ID, log_page_id));
1081 
1082 	nvmft_send_response(ctrlr->admin, &cpl);
1083 }
1084 
/*
 * Record that the namespace backing LUN 'lun_id' has changed and
 * raise a namespace attribute AER unless one has already been
 * reported.
 *
 * The namespace ID (lun_id + 1) is inserted into the controller's
 * changed namespace list, which is kept sorted in ascending order and
 * terminated by zero entries.  If the list is already full, it is
 * reset to a single 0xffffffff entry.
 */
void
nvmft_controller_lun_changed(struct nvmft_controller *ctrlr, int lun_id)
{
	struct nvme_ns_list *nslist;
	uint32_t new_nsid, nsid;
	u_int i;

	/* The namespace ID used here is the LUN ID plus one. */
	new_nsid = lun_id + 1;

	mtx_lock(&ctrlr->lock);
	nslist = ctrlr->changed_ns;

	/* If the first entry is 0xffffffff, the list is already full. */
	if (nslist->ns[0] != 0xffffffff) {
		/* Find the insertion point for this namespace ID. */
		for (i = 0; i < nitems(nslist->ns); i++) {
			nsid = le32toh(nslist->ns[i]);
			if (nsid == new_nsid) {
				/* Already reported, nothing to do. */
				mtx_unlock(&ctrlr->lock);
				return;
			}

			/* Stop at the first free slot or first larger ID. */
			if (nsid == 0 || nsid > new_nsid)
				break;
		}

		if (nslist->ns[nitems(nslist->ns) - 1] != htole32(0)) {
			/* List is full. */
			memset(ctrlr->changed_ns, 0,
			    sizeof(*ctrlr->changed_ns));
			ctrlr->changed_ns->ns[0] = 0xffffffff;
		} else if (nslist->ns[i] == htole32(0)) {
			/*
			 * Optimize case where this ID is appended to
			 * the end.
			 */
			nslist->ns[i] = htole32(new_nsid);
		} else {
			/* Shift the tail up one slot to make room at 'i'. */
			memmove(&nslist->ns[i + 1], &nslist->ns[i],
			    (nitems(nslist->ns) - i - 1) *
			    sizeof(nslist->ns[0]));
			nslist->ns[i] = htole32(new_nsid);
		}
	}

	/* Only one AER is raised until the host reads the log page. */
	if (ctrlr->changed_ns_reported) {
		mtx_unlock(&ctrlr->lock);
		return;
	}
	ctrlr->changed_ns_reported = true;
	mtx_unlock(&ctrlr->lock);

	nvmft_report_aer(ctrlr, NVME_ASYNC_EVENT_NS_ATTRIBUTE, 0x2, 0x0,
	    NVME_LOG_CHANGED_NAMESPACE);
}
1141