xref: /freebsd/sys/dev/nvmf/controller/nvmft_controller.c (revision b2d2a78ad80ec68d4a17f5aef97d21686cb1e29b)
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2023-2024 Chelsio Communications, Inc.
 * Written by: John Baldwin <jhb@FreeBSD.org>
 */

#include <sys/param.h>
#include <sys/callout.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/memdesc.h>
#include <sys/mutex.h>
#include <sys/sbuf.h>
#include <sys/sx.h>
#include <sys/taskqueue.h>

#include <dev/nvmf/nvmf_transport.h>
#include <dev/nvmf/controller/nvmft_subr.h>
#include <dev/nvmf/controller/nvmft_var.h>

static void	nvmft_controller_shutdown(void *arg, int pending);
static void	nvmft_controller_terminate(void *arg, int pending);

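/*
 * Log a console message prefixed with the controller ID.  The message
 * is accumulated in a fixed-size sbuf drained via sbuf_printf_drain,
 * and the number of characters written is returned.
 */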
int
nvmft_printf(struct nvmft_controller *ctrlr, const char *fmt, ...)
{
	char buf[128];
	struct sbuf sb;
	va_list ap;
	size_t retval;

	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
	sbuf_set_drain(&sb, sbuf_printf_drain, &retval);

	sbuf_printf(&sb, "nvmft%u: ", ctrlr->cntlid);

	va_start(ap, fmt);
	sbuf_vprintf(&sb, fmt, ap);
	va_end(ap);

	sbuf_finish(&sb);
	sbuf_delete(&sb);

	return (retval);
}

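/*
 * Allocate and initialize a controller for a new association,
 * inserting it on the port's list of controllers and taking a
 * reference on the port.
 */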
static struct nvmft_controller *
nvmft_controller_alloc(struct nvmft_port *np, uint16_t cntlid,
    const struct nvmf_fabric_connect_data *data)
{
	struct nvmft_controller *ctrlr;

	ctrlr = malloc(sizeof(*ctrlr), M_NVMFT, M_WAITOK | M_ZERO);
	ctrlr->cntlid = cntlid;
	nvmft_port_ref(np);
	TAILQ_INSERT_TAIL(&np->controllers, ctrlr, link);
	ctrlr->np = np;
	mtx_init(&ctrlr->lock, "nvmft controller", NULL, MTX_DEF);
	callout_init(&ctrlr->ka_timer, 1);
	TASK_INIT(&ctrlr->shutdown_task, 0, nvmft_controller_shutdown, ctrlr);
	TIMEOUT_TASK_INIT(taskqueue_thread, &ctrlr->terminate_task, 0,
	    nvmft_controller_terminate, ctrlr);

	ctrlr->cdata = np->cdata;
	ctrlr->cdata.ctrlr_id = htole16(cntlid);
	memcpy(ctrlr->hostid, data->hostid, sizeof(ctrlr->hostid));
	memcpy(ctrlr->hostnqn, data->hostnqn, sizeof(ctrlr->hostnqn));
	ctrlr->hip.power_cycles[0] = 1;
	ctrlr->create_time = sbinuptime();

	ctrlr->changed_ns = malloc(sizeof(*ctrlr->changed_ns), M_NVMFT,
	    M_WAITOK | M_ZERO);

	return (ctrlr);
}

static void
nvmft_controller_free(struct nvmft_controller *ctrlr)
{
	mtx_destroy(&ctrlr->lock);
	MPASS(ctrlr->io_qpairs == NULL);
	free(ctrlr->changed_ns, M_NVMFT);
	free(ctrlr, M_NVMFT);
}

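/*
 * KeepAlive timer callback.  If no traffic has arrived since the last
 * expiration, treat the association as dead; otherwise clear the
 * traffic flag and rearm the timer.
 */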
static void
nvmft_keep_alive_timer(void *arg)
{
	struct nvmft_controller *ctrlr = arg;
	int traffic;

	if (ctrlr->shutdown)
		return;

	traffic = atomic_readandclear_int(&ctrlr->ka_active_traffic);
	if (traffic == 0) {
		nvmft_printf(ctrlr,
		    "disconnecting due to KeepAlive timeout\n");
		nvmft_controller_error(ctrlr, NULL, ETIMEDOUT);
		return;
	}

	callout_schedule_sbt(&ctrlr->ka_timer, ctrlr->ka_sbt, 0, C_HARDCLOCK);
}

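/*
 * Create a new controller in response to a Fabrics CONNECT command on
 * an admin queue: allocate a controller ID, bind the admin queue pair
 * to the new controller, and arm the KeepAlive timer if the host
 * supplied a non-zero KATO.
 */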
int
nvmft_handoff_admin_queue(struct nvmft_port *np, enum nvmf_trtype trtype,
    const nvlist_t *params, const struct nvmf_fabric_connect_cmd *cmd,
    const struct nvmf_fabric_connect_data *data)
{
	struct nvmft_controller *ctrlr;
	struct nvmft_qpair *qp;
	uint32_t kato;
	int cntlid;

	if (cmd->qid != htole16(0))
		return (EINVAL);

	qp = nvmft_qpair_init(trtype, params, 0, "admin queue");
	if (qp == NULL) {
		printf("NVMFT: Failed to setup admin queue from %.*s\n",
		    (int)sizeof(data->hostnqn), data->hostnqn);
		return (ENXIO);
	}

	sx_xlock(&np->lock);
	cntlid = alloc_unr(np->ids);
	if (cntlid == -1) {
		sx_xunlock(&np->lock);
		printf("NVMFT: Unable to allocate controller for %.*s\n",
		    (int)sizeof(data->hostnqn), data->hostnqn);
		nvmft_connect_error(qp, cmd, NVME_SCT_COMMAND_SPECIFIC,
		    NVMF_FABRIC_SC_INVALID_HOST);
		nvmft_qpair_destroy(qp);
		return (ENOMEM);
	}

#ifdef INVARIANTS
	TAILQ_FOREACH(ctrlr, &np->controllers, link) {
		KASSERT(ctrlr->cntlid != cntlid,
		    ("%s: duplicate controllers with id %d", __func__, cntlid));
	}
#endif

	ctrlr = nvmft_controller_alloc(np, cntlid, data);
	nvmft_printf(ctrlr, "associated with %.*s\n",
	    (int)sizeof(data->hostnqn), data->hostnqn);
	ctrlr->admin = qp;
	ctrlr->trtype = trtype;

	/*
	 * The spec requires a non-zero KeepAlive timer, but allow a
	 * zero KATO value to match Linux.
	 */
	kato = le32toh(cmd->kato);
	if (kato != 0) {
		/*
		 * Round up to 1 second matching granularity
		 * advertised in cdata.
		 */
		ctrlr->ka_sbt = mstosbt(roundup(kato, 1000));
		callout_reset_sbt(&ctrlr->ka_timer, ctrlr->ka_sbt, 0,
		    nvmft_keep_alive_timer, ctrlr, C_HARDCLOCK);
	}

	nvmft_finish_accept(qp, cmd, ctrlr);
	sx_xunlock(&np->lock);

	return (0);
}

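/*
 * Attach a new I/O queue pair to an existing controller in response
 * to a Fabrics CONNECT command, validating the host identity and the
 * requested queue ID against the queue count negotiated via
 * SET_FEATURES.
 */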
int
nvmft_handoff_io_queue(struct nvmft_port *np, enum nvmf_trtype trtype,
    const nvlist_t *params, const struct nvmf_fabric_connect_cmd *cmd,
    const struct nvmf_fabric_connect_data *data)
{
	struct nvmft_controller *ctrlr;
	struct nvmft_qpair *qp;
	char name[16];
	uint16_t cntlid, qid;

	qid = le16toh(cmd->qid);
	if (qid == 0)
		return (EINVAL);
	cntlid = le16toh(data->cntlid);

	snprintf(name, sizeof(name), "I/O queue %u", qid);
	qp = nvmft_qpair_init(trtype, params, qid, name);
	if (qp == NULL) {
		printf("NVMFT: Failed to setup I/O queue %u from %.*s\n", qid,
		    (int)sizeof(data->hostnqn), data->hostnqn);
		return (ENXIO);
	}

	sx_slock(&np->lock);
	TAILQ_FOREACH(ctrlr, &np->controllers, link) {
		if (ctrlr->cntlid == cntlid)
			break;
	}
	if (ctrlr == NULL) {
		sx_sunlock(&np->lock);
		printf("NVMFT: Nonexistent controller %u for I/O queue %u from %.*s\n",
		    cntlid, qid, (int)sizeof(data->hostnqn),
		    data->hostnqn);
		nvmft_connect_invalid_parameters(qp, cmd, true,
		    offsetof(struct nvmf_fabric_connect_data, cntlid));
		nvmft_qpair_destroy(qp);
		return (ENOENT);
	}

	if (memcmp(ctrlr->hostid, data->hostid, sizeof(ctrlr->hostid)) != 0) {
		sx_sunlock(&np->lock);
		nvmft_printf(ctrlr,
		    "hostid mismatch for I/O queue %u from %.*s\n", qid,
		    (int)sizeof(data->hostnqn), data->hostnqn);
		nvmft_connect_invalid_parameters(qp, cmd, true,
		    offsetof(struct nvmf_fabric_connect_data, hostid));
		nvmft_qpair_destroy(qp);
		return (EINVAL);
	}
	if (memcmp(ctrlr->hostnqn, data->hostnqn, sizeof(ctrlr->hostnqn)) != 0) {
		sx_sunlock(&np->lock);
		nvmft_printf(ctrlr,
		    "hostnqn mismatch for I/O queue %u from %.*s\n", qid,
		    (int)sizeof(data->hostnqn), data->hostnqn);
		nvmft_connect_invalid_parameters(qp, cmd, true,
		    offsetof(struct nvmf_fabric_connect_data, hostnqn));
		nvmft_qpair_destroy(qp);
		return (EINVAL);
	}

	/* XXX: Require trtype == ctrlr->trtype? */

	mtx_lock(&ctrlr->lock);
	if (ctrlr->shutdown) {
		mtx_unlock(&ctrlr->lock);
		sx_sunlock(&np->lock);
		nvmft_printf(ctrlr,
		    "attempt to create I/O queue %u on disabled controller from %.*s\n",
		    qid, (int)sizeof(data->hostnqn), data->hostnqn);
		nvmft_connect_invalid_parameters(qp, cmd, true,
		    offsetof(struct nvmf_fabric_connect_data, cntlid));
		nvmft_qpair_destroy(qp);
		return (EINVAL);
	}
	if (ctrlr->num_io_queues == 0) {
		mtx_unlock(&ctrlr->lock);
		sx_sunlock(&np->lock);
		nvmft_printf(ctrlr,
		    "attempt to create I/O queue %u without enabled queues from %.*s\n",
		    qid, (int)sizeof(data->hostnqn), data->hostnqn);
		nvmft_connect_error(qp, cmd, NVME_SCT_GENERIC,
		    NVME_SC_COMMAND_SEQUENCE_ERROR);
		nvmft_qpair_destroy(qp);
		return (EINVAL);
	}
	if (qid > ctrlr->num_io_queues) {
		mtx_unlock(&ctrlr->lock);
		sx_sunlock(&np->lock);
		nvmft_printf(ctrlr,
		    "attempt to create invalid I/O queue %u from %.*s\n", qid,
		    (int)sizeof(data->hostnqn), data->hostnqn);
		nvmft_connect_invalid_parameters(qp, cmd, false,
		    offsetof(struct nvmf_fabric_connect_cmd, qid));
		nvmft_qpair_destroy(qp);
		return (EINVAL);
	}
	if (ctrlr->io_qpairs[qid - 1].qp != NULL) {
		mtx_unlock(&ctrlr->lock);
		sx_sunlock(&np->lock);
		nvmft_printf(ctrlr,
		    "attempt to re-create I/O queue %u from %.*s\n", qid,
		    (int)sizeof(data->hostnqn), data->hostnqn);
		nvmft_connect_error(qp, cmd, NVME_SCT_GENERIC,
		    NVME_SC_COMMAND_SEQUENCE_ERROR);
		nvmft_qpair_destroy(qp);
		return (EINVAL);
	}

	ctrlr->io_qpairs[qid - 1].qp = qp;
	mtx_unlock(&ctrlr->lock);
	nvmft_finish_accept(qp, cmd, ctrlr);
	sx_sunlock(&np->lock);

	return (0);
}

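/*
 * Controller shutdown task.  Quiesces and tears down the I/O queues,
 * drains pending CTL commands, updates CSTS to reflect shutdown
 * completion, and schedules termination of the association.
 */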
static void
nvmft_controller_shutdown(void *arg, int pending)
{
	struct nvmft_controller *ctrlr = arg;

	MPASS(pending == 1);

	/*
	 * Shutdown all I/O queues to terminate pending datamoves and
	 * stop receiving new commands.
	 */
	mtx_lock(&ctrlr->lock);
	for (u_int i = 0; i < ctrlr->num_io_queues; i++) {
		if (ctrlr->io_qpairs[i].qp != NULL) {
			ctrlr->io_qpairs[i].shutdown = true;
			mtx_unlock(&ctrlr->lock);
			nvmft_qpair_shutdown(ctrlr->io_qpairs[i].qp);
			mtx_lock(&ctrlr->lock);
		}
	}
	mtx_unlock(&ctrlr->lock);

	/* Terminate active CTL commands. */
	nvmft_terminate_commands(ctrlr);

	/* Wait for all pending CTL commands to complete. */
	mtx_lock(&ctrlr->lock);
	while (ctrlr->pending_commands != 0)
		mtx_sleep(&ctrlr->pending_commands, &ctrlr->lock, 0, "nvmftsh",
		    hz / 100);
	mtx_unlock(&ctrlr->lock);

	/* Delete all of the I/O queues. */
	for (u_int i = 0; i < ctrlr->num_io_queues; i++) {
		if (ctrlr->io_qpairs[i].qp != NULL)
			nvmft_qpair_destroy(ctrlr->io_qpairs[i].qp);
	}
	free(ctrlr->io_qpairs, M_NVMFT);
	ctrlr->io_qpairs = NULL;

	mtx_lock(&ctrlr->lock);
	ctrlr->num_io_queues = 0;

	/* Mark shutdown complete. */
	if (NVMEV(NVME_CSTS_REG_SHST, ctrlr->csts) == NVME_SHST_OCCURRING) {
		ctrlr->csts &= ~NVMEM(NVME_CSTS_REG_SHST);
		ctrlr->csts |= NVMEF(NVME_CSTS_REG_SHST, NVME_SHST_COMPLETE);
	}

	if (NVMEV(NVME_CSTS_REG_CFS, ctrlr->csts) == 0) {
		ctrlr->csts &= ~NVMEM(NVME_CSTS_REG_RDY);
		ctrlr->shutdown = false;
	}
	mtx_unlock(&ctrlr->lock);

	/*
	 * If the admin queue was closed while shutting down or a
	 * fatal controller error has occurred, terminate the
	 * association immediately, otherwise wait up to 2 minutes
	 * (NVMe-over-Fabrics 1.1 4.6).
	 */
	if (ctrlr->admin_closed || NVMEV(NVME_CSTS_REG_CFS, ctrlr->csts) != 0)
		nvmft_controller_terminate(ctrlr, 0);
	else
		taskqueue_enqueue_timeout(taskqueue_thread,
		    &ctrlr->terminate_task, hz * 60 * 2);
}

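/*
 * Terminate the association: destroy the admin queue pair, release
 * the controller ID, and free the controller.  If the controller was
 * re-enabled before the termination timeout fired, just restart the
 * KeepAlive timer instead.
 */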
static void
nvmft_controller_terminate(void *arg, int pending)
{
	struct nvmft_controller *ctrlr = arg;
	struct nvmft_port *np;
	bool wakeup_np;

	/* If the controller has been re-enabled, nothing to do. */
	mtx_lock(&ctrlr->lock);
	if (NVMEV(NVME_CC_REG_EN, ctrlr->cc) != 0) {
		mtx_unlock(&ctrlr->lock);

		if (ctrlr->ka_sbt != 0)
			callout_schedule_sbt(&ctrlr->ka_timer, ctrlr->ka_sbt, 0,
			    C_HARDCLOCK);
		return;
	}

	/* Disable updates to CC while destroying admin qpair. */
	ctrlr->shutdown = true;
	mtx_unlock(&ctrlr->lock);

	nvmft_qpair_destroy(ctrlr->admin);

	/* Remove association (CNTLID). */
	np = ctrlr->np;
	sx_xlock(&np->lock);
	TAILQ_REMOVE(&np->controllers, ctrlr, link);
	free_unr(np->ids, ctrlr->cntlid);
	wakeup_np = (!np->online && TAILQ_EMPTY(&np->controllers));
	sx_xunlock(&np->lock);
	if (wakeup_np)
		wakeup(np);

	callout_drain(&ctrlr->ka_timer);

	nvmft_printf(ctrlr, "association terminated\n");
	nvmft_controller_free(ctrlr);
	nvmft_port_rele(np);
}

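/*
 * Handle errors reported by the transport layer.  A zero error means
 * a queue pair was closed; a non-zero error is treated as a fatal
 * transport error that sets CSTS.CFS and triggers a shutdown.
 */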
void
nvmft_controller_error(struct nvmft_controller *ctrlr, struct nvmft_qpair *qp,
    int error)
{
	/*
	 * If a queue pair is closed, that isn't an error per se.
	 * That just means additional commands cannot be received on
	 * that queue pair.
	 *
	 * If the admin queue pair is closed while idle or while
	 * shutting down, terminate the association immediately.
	 *
	 * If an I/O queue pair is closed, just ignore it.
	 */
	if (error == 0) {
		if (qp != ctrlr->admin)
			return;

		mtx_lock(&ctrlr->lock);
		if (ctrlr->shutdown) {
			ctrlr->admin_closed = true;
			mtx_unlock(&ctrlr->lock);
			return;
		}

		if (NVMEV(NVME_CC_REG_EN, ctrlr->cc) == 0) {
			MPASS(ctrlr->num_io_queues == 0);
			mtx_unlock(&ctrlr->lock);

			/*
			 * Ok to drop lock here since ctrlr->cc can't
			 * change if the admin queue pair has closed.
			 * This also means no new queues can be handed
			 * off, etc.  Note that since there are no I/O
			 * queues, only the admin queue needs to be
			 * destroyed, so it is safe to skip
			 * nvmft_controller_shutdown and just schedule
			 * nvmft_controller_terminate.  Note that we
			 * cannot call nvmft_controller_terminate from
			 * here directly as this is called from the
			 * transport layer and freeing the admin qpair
			 * might deadlock waiting for the current
			 * thread to exit.
			 */
			if (taskqueue_cancel_timeout(taskqueue_thread,
			    &ctrlr->terminate_task, NULL) == 0)
				taskqueue_enqueue_timeout(taskqueue_thread,
				    &ctrlr->terminate_task, 0);
			return;
		}

		/*
		 * Treat closing of the admin queue pair while enabled
		 * as a transport error.  Note that the admin queue
		 * pair has been closed.
		 */
		ctrlr->admin_closed = true;
	} else
		mtx_lock(&ctrlr->lock);

	/* Ignore transport errors if we are already shutting down. */
	if (ctrlr->shutdown) {
		mtx_unlock(&ctrlr->lock);
		return;
	}

	ctrlr->csts |= NVMEF(NVME_CSTS_REG_CFS, 1);
	ctrlr->cc &= ~NVMEM(NVME_CC_REG_EN);
	ctrlr->shutdown = true;
	mtx_unlock(&ctrlr->lock);

	callout_stop(&ctrlr->ka_timer);
	taskqueue_enqueue(taskqueue_thread, &ctrlr->shutdown_task);
}

/* Wrapper around m_getm2 that also sets m_len in the mbufs in the chain. */
static struct mbuf *
m_getml(size_t len, int how)
{
	struct mbuf *m, *n;

	m = m_getm2(NULL, len, how, MT_DATA, 0);
	if (m == NULL)
		return (NULL);
	for (n = m; len > 0; n = n->m_next) {
		n->m_len = M_SIZE(n);
		if (n->m_len >= len) {
			n->m_len = len;
			MPASS(n->m_next == NULL);
		}
		len -= n->m_len;
	}
	return (m);
}

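/* Zero 'len' bytes of an mbuf chain starting at byte 'offset'. */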
static void
m_zero(struct mbuf *m, u_int offset, u_int len)
{
	u_int todo;

	if (len == 0)
		return;

	while (m->m_len <= offset) {
		offset -= m->m_len;
		m = m->m_next;
	}

	todo = m->m_len - offset;
	if (todo > len)
		todo = len;
	memset(mtodo(m, offset), 0, todo);
	m = m->m_next;
	len -= todo;

	while (len > 0) {
		todo = m->m_len;
		if (todo > len)
			todo = len;
		memset(mtod(m, void *), 0, todo);
		m = m->m_next;
		len -= todo;
	}
}

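/*
 * GET LOG PAGE.  Supported pages are synthesized in memory and
 * returned via nvmf_send_controller_data; the requested range is
 * zero-filled beyond the end of the page.
 */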
static void
handle_get_log_page(struct nvmft_controller *ctrlr,
    struct nvmf_capsule *nc, const struct nvme_command *cmd)
{
	struct mbuf *m;
	uint64_t offset;
	uint32_t numd;
	size_t len, todo;
	u_int status;
	uint8_t lid;
	bool rae;

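	/*
	 * CDW10 holds the log page ID in bits 7:0, the Retain
	 * Asynchronous Event bit in bit 15, and the low 16 bits of
	 * NUMD (a 0's based dword count completed by CDW11).  CDW12
	 * and CDW13 hold the byte offset into the log page.
	 */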
	lid = le32toh(cmd->cdw10) & 0xff;
	rae = (le32toh(cmd->cdw10) & (1U << 15)) != 0;
	numd = le32toh(cmd->cdw10) >> 16 | le32toh(cmd->cdw11) << 16;
	offset = le32toh(cmd->cdw12) | (uint64_t)le32toh(cmd->cdw13) << 32;

	if (offset % 4 != 0) {
		status = NVME_SC_INVALID_FIELD;
		goto done;
	}

	len = (numd + 1) * 4;

	switch (lid) {
	case NVME_LOG_ERROR:
		todo = 0;

		m = m_getml(len, M_WAITOK);
		if (todo != len)
			m_zero(m, todo, len - todo);
		status = nvmf_send_controller_data(nc, 0, m, len);
		MPASS(status != NVMF_MORE);
		break;
	case NVME_LOG_HEALTH_INFORMATION:
	{
		struct nvme_health_information_page hip;

		if (offset >= sizeof(hip)) {
			status = NVME_SC_INVALID_FIELD;
			goto done;
		}
		todo = sizeof(hip) - offset;
		if (todo > len)
			todo = len;

		mtx_lock(&ctrlr->lock);
		hip = ctrlr->hip;
		hip.controller_busy_time[0] =
		    sbintime_getsec(ctrlr->busy_total) / 60;
		hip.power_on_hours[0] =
		    sbintime_getsec(sbinuptime() - ctrlr->create_time) / 3600;
		mtx_unlock(&ctrlr->lock);

		m = m_getml(len, M_WAITOK);
		m_copyback(m, 0, todo, (char *)&hip + offset);
		if (todo != len)
			m_zero(m, todo, len - todo);
		status = nvmf_send_controller_data(nc, 0, m, len);
		MPASS(status != NVMF_MORE);
		break;
	}
	case NVME_LOG_FIRMWARE_SLOT:
		if (offset >= sizeof(ctrlr->np->fp)) {
			status = NVME_SC_INVALID_FIELD;
			goto done;
		}
		todo = sizeof(ctrlr->np->fp) - offset;
		if (todo > len)
			todo = len;

		m = m_getml(len, M_WAITOK);
		m_copyback(m, 0, todo, (char *)&ctrlr->np->fp + offset);
		if (todo != len)
			m_zero(m, todo, len - todo);
		status = nvmf_send_controller_data(nc, 0, m, len);
		MPASS(status != NVMF_MORE);
		break;
	case NVME_LOG_CHANGED_NAMESPACE:
		if (offset >= sizeof(*ctrlr->changed_ns)) {
			status = NVME_SC_INVALID_FIELD;
			goto done;
		}
		todo = sizeof(*ctrlr->changed_ns) - offset;
		if (todo > len)
			todo = len;

		m = m_getml(len, M_WAITOK);
		mtx_lock(&ctrlr->lock);
		m_copyback(m, 0, todo, (char *)ctrlr->changed_ns + offset);
		if (offset == 0 && len == sizeof(*ctrlr->changed_ns))
			memset(ctrlr->changed_ns, 0,
			    sizeof(*ctrlr->changed_ns));
		if (!rae)
			ctrlr->changed_ns_reported = false;
		mtx_unlock(&ctrlr->lock);
		if (todo != len)
			m_zero(m, todo, len - todo);
		status = nvmf_send_controller_data(nc, 0, m, len);
		MPASS(status != NVMF_MORE);
		break;
	default:
		nvmft_printf(ctrlr, "Unsupported page %#x for GET_LOG_PAGE\n",
		    lid);
		status = NVME_SC_INVALID_FIELD;
		break;
	}

done:
	if (status == NVMF_SUCCESS_SENT)
		nvmft_command_completed(ctrlr->admin, nc);
	else
		nvmft_send_generic_error(ctrlr->admin, nc, status);
	nvmf_free_capsule(nc);
}

static void
m_free_nslist(struct mbuf *m)
{
	free(m->m_ext.ext_arg1, M_NVMFT);
}

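/*
 * IDENTIFY.  Namespace-specific CNS values are forwarded to CTL;
 * controller data and the active namespace list are answered locally.
 */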
648 handle_identify_command(struct nvmft_controller *ctrlr,
649     struct nvmf_capsule *nc, const struct nvme_command *cmd)
650 {
651 	struct mbuf *m;
652 	size_t data_len;
653 	u_int status;
654 	uint8_t cns;
655 
656 	cns = le32toh(cmd->cdw10) & 0xFF;
657 	data_len = nvmf_capsule_data_len(nc);
658 	if (data_len != sizeof(ctrlr->cdata)) {
659 		nvmft_printf(ctrlr,
660 		    "Invalid length %zu for IDENTIFY with CNS %#x\n", data_len,
661 		    cns);
662 		nvmft_send_generic_error(ctrlr->admin, nc,
663 		    NVME_SC_INVALID_OPCODE);
664 		nvmf_free_capsule(nc);
665 		return;
666 	}
667 
668 	switch (cns) {
669 	case 0:	/* Namespace data. */
670 	case 3:	/* Namespace Identification Descriptor list. */
671 		nvmft_dispatch_command(ctrlr->admin, nc, true);
672 		return;
673 	case 1:
674 		/* Controller data. */
675 		m = m_getml(sizeof(ctrlr->cdata), M_WAITOK);
676 		m_copyback(m, 0, sizeof(ctrlr->cdata), (void *)&ctrlr->cdata);
677 		status = nvmf_send_controller_data(nc, 0, m,
678 		    sizeof(ctrlr->cdata));
679 		MPASS(status != NVMF_MORE);
680 		break;
681 	case 2:
682 	{
683 		/* Active namespace list. */
684 		struct nvme_ns_list *nslist;
685 		uint32_t nsid;
686 
687 		nsid = le32toh(cmd->nsid);
688 		if (nsid >= 0xfffffffe) {
689 			status = NVME_SC_INVALID_FIELD;
690 			break;
691 		}
692 
693 		nslist = malloc(sizeof(*nslist), M_NVMFT, M_WAITOK | M_ZERO);
694 		nvmft_populate_active_nslist(ctrlr->np, nsid, nslist);
695 		m = m_get(M_WAITOK, MT_DATA);
696 		m_extadd(m, (void *)nslist, sizeof(*nslist), m_free_nslist,
697 		    nslist, NULL, 0, EXT_CTL);
698 		m->m_len = sizeof(*nslist);
699 		status = nvmf_send_controller_data(nc, 0, m, m->m_len);
700 		MPASS(status != NVMF_MORE);
701 		break;
702 	}
703 	default:
704 		nvmft_printf(ctrlr, "Unsupported CNS %#x for IDENTIFY\n", cns);
705 		status = NVME_SC_INVALID_FIELD;
706 		break;
707 	}
708 
709 	if (status == NVMF_SUCCESS_SENT)
710 		nvmft_command_completed(ctrlr->admin, nc);
711 	else
712 		nvmft_send_generic_error(ctrlr->admin, nc, status);
713 	nvmf_free_capsule(nc);
714 }
715 
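/*
 * SET_FEATURES.  Only Number of Queues (which allocates the I/O
 * queue pair array) and Asynchronous Event Configuration are
 * implemented.
 */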
static void
handle_set_features(struct nvmft_controller *ctrlr,
    struct nvmf_capsule *nc, const struct nvme_command *cmd)
{
	struct nvme_completion cqe;
	uint8_t fid;

	fid = NVMEV(NVME_FEAT_SET_FID, le32toh(cmd->cdw10));
	switch (fid) {
	case NVME_FEAT_NUMBER_OF_QUEUES:
	{
		uint32_t num_queues;
		struct nvmft_io_qpair *io_qpairs;

		num_queues = le32toh(cmd->cdw11) & 0xffff;

		/* 5.12.1.7: 65535 is invalid. */
		if (num_queues == 65535)
			goto error;

		/* Fabrics requires the same number of SQs and CQs. */
		if (le32toh(cmd->cdw11) >> 16 != num_queues)
			goto error;

		/* Convert to 1's based */
		num_queues++;

		io_qpairs = mallocarray(num_queues, sizeof(*io_qpairs),
		    M_NVMFT, M_WAITOK | M_ZERO);

		mtx_lock(&ctrlr->lock);
		if (ctrlr->num_io_queues != 0) {
			mtx_unlock(&ctrlr->lock);
			free(io_qpairs, M_NVMFT);
			nvmft_send_generic_error(ctrlr->admin, nc,
			    NVME_SC_COMMAND_SEQUENCE_ERROR);
			nvmf_free_capsule(nc);
			return;
		}

		ctrlr->num_io_queues = num_queues;
		ctrlr->io_qpairs = io_qpairs;
		mtx_unlock(&ctrlr->lock);

		nvmft_init_cqe(&cqe, nc, 0);
		cqe.cdw0 = cmd->cdw11;
		nvmft_send_response(ctrlr->admin, &cqe);
		nvmf_free_capsule(nc);
		return;
	}
	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
	{
		uint32_t aer_mask;

		aer_mask = le32toh(cmd->cdw11);

		/* Check for any reserved or unimplemented feature bits. */
		if ((aer_mask & 0xffffc000) != 0)
			goto error;

		mtx_lock(&ctrlr->lock);
		ctrlr->aer_mask = aer_mask;
		mtx_unlock(&ctrlr->lock);
		nvmft_send_success(ctrlr->admin, nc);
		nvmf_free_capsule(nc);
		return;
	}
	default:
		nvmft_printf(ctrlr,
		    "Unsupported feature ID %u for SET_FEATURES\n", fid);
		goto error;
	}

error:
	nvmft_send_generic_error(ctrlr->admin, nc, NVME_SC_INVALID_FIELD);
	nvmf_free_capsule(nc);
}

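/*
 * Validate and apply a new value of the CC register.  Returns false
 * if the update is rejected.  *need_shutdown is set if the host
 * requested a shutdown (CC.SHN) or a reset (clearing CC.EN).
 */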
static bool
update_cc(struct nvmft_controller *ctrlr, uint32_t new_cc, bool *need_shutdown)
{
	struct nvmft_port *np = ctrlr->np;
	uint32_t changes;

	*need_shutdown = false;

	mtx_lock(&ctrlr->lock);

	/* Don't allow any changes while shutting down. */
	if (ctrlr->shutdown) {
		mtx_unlock(&ctrlr->lock);
		return (false);
	}

	if (!_nvmf_validate_cc(np->max_io_qsize, np->cap, ctrlr->cc, new_cc)) {
		mtx_unlock(&ctrlr->lock);
		return (false);
	}

	changes = ctrlr->cc ^ new_cc;
	ctrlr->cc = new_cc;

	/* Handle shutdown requests. */
	if (NVMEV(NVME_CC_REG_SHN, changes) != 0 &&
	    NVMEV(NVME_CC_REG_SHN, new_cc) != 0) {
		ctrlr->csts &= ~NVMEM(NVME_CSTS_REG_SHST);
		ctrlr->csts |= NVMEF(NVME_CSTS_REG_SHST, NVME_SHST_OCCURRING);
		ctrlr->cc &= ~NVMEM(NVME_CC_REG_EN);
		ctrlr->shutdown = true;
		*need_shutdown = true;
		nvmft_printf(ctrlr, "shutdown requested\n");
	}

	if (NVMEV(NVME_CC_REG_EN, changes) != 0) {
		if (NVMEV(NVME_CC_REG_EN, new_cc) == 0) {
			/* Controller reset. */
			nvmft_printf(ctrlr, "reset requested\n");
			ctrlr->shutdown = true;
			*need_shutdown = true;
		} else
			ctrlr->csts |= NVMEF(NVME_CSTS_REG_RDY, 1);
	}
	mtx_unlock(&ctrlr->lock);

	return (true);
}

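/*
 * Fabrics Property Get.  Only the CAP, VS, CC, and CSTS properties
 * are supported; the capsule is freed by the caller.
 */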
static void
handle_property_get(struct nvmft_controller *ctrlr, struct nvmf_capsule *nc,
    const struct nvmf_fabric_prop_get_cmd *pget)
{
	struct nvmf_fabric_prop_get_rsp rsp;

	nvmft_init_cqe(&rsp, nc, 0);

	switch (le32toh(pget->ofst)) {
	case NVMF_PROP_CAP:
		if (pget->attrib.size != NVMF_PROP_SIZE_8)
			goto error;
		rsp.value.u64 = htole64(ctrlr->np->cap);
		break;
	case NVMF_PROP_VS:
		if (pget->attrib.size != NVMF_PROP_SIZE_4)
			goto error;
		rsp.value.u32.low = ctrlr->cdata.ver;
		break;
	case NVMF_PROP_CC:
		if (pget->attrib.size != NVMF_PROP_SIZE_4)
			goto error;
		rsp.value.u32.low = htole32(ctrlr->cc);
		break;
	case NVMF_PROP_CSTS:
		if (pget->attrib.size != NVMF_PROP_SIZE_4)
			goto error;
		rsp.value.u32.low = htole32(ctrlr->csts);
		break;
	default:
		goto error;
	}

	nvmft_send_response(ctrlr->admin, &rsp);
	return;
error:
	nvmft_send_generic_error(ctrlr->admin, nc, NVME_SC_INVALID_FIELD);
}

static void
handle_property_set(struct nvmft_controller *ctrlr, struct nvmf_capsule *nc,
    const struct nvmf_fabric_prop_set_cmd *pset)
{
	bool need_shutdown;

	need_shutdown = false;
	switch (le32toh(pset->ofst)) {
	case NVMF_PROP_CC:
		if (pset->attrib.size != NVMF_PROP_SIZE_4)
			goto error;
		if (!update_cc(ctrlr, le32toh(pset->value.u32.low),
		    &need_shutdown))
			goto error;
		break;
	default:
		goto error;
	}

	nvmft_send_success(ctrlr->admin, nc);
	if (need_shutdown) {
		callout_stop(&ctrlr->ka_timer);
		taskqueue_enqueue(taskqueue_thread, &ctrlr->shutdown_task);
	}
	return;
error:
	nvmft_send_generic_error(ctrlr->admin, nc, NVME_SC_INVALID_FIELD);
}

static void
handle_admin_fabrics_command(struct nvmft_controller *ctrlr,
    struct nvmf_capsule *nc, const struct nvmf_fabric_cmd *fc)
{
	switch (fc->fctype) {
	case NVMF_FABRIC_COMMAND_PROPERTY_GET:
		handle_property_get(ctrlr, nc,
		    (const struct nvmf_fabric_prop_get_cmd *)fc);
		break;
	case NVMF_FABRIC_COMMAND_PROPERTY_SET:
		handle_property_set(ctrlr, nc,
		    (const struct nvmf_fabric_prop_set_cmd *)fc);
		break;
	case NVMF_FABRIC_COMMAND_CONNECT:
		nvmft_printf(ctrlr,
		    "CONNECT command on connected admin queue\n");
		nvmft_send_generic_error(ctrlr->admin, nc,
		    NVME_SC_COMMAND_SEQUENCE_ERROR);
		break;
	case NVMF_FABRIC_COMMAND_DISCONNECT:
		nvmft_printf(ctrlr, "DISCONNECT command on admin queue\n");
		nvmft_send_error(ctrlr->admin, nc, NVME_SCT_COMMAND_SPECIFIC,
		    NVMF_FABRIC_SC_INVALID_QUEUE_TYPE);
		break;
	default:
		nvmft_printf(ctrlr, "Unsupported fabrics command %#x\n",
		    fc->fctype);
		nvmft_send_generic_error(ctrlr->admin, nc,
		    NVME_SC_INVALID_OPCODE);
		break;
	}
	nvmf_free_capsule(nc);
}

void
nvmft_handle_admin_command(struct nvmft_controller *ctrlr,
    struct nvmf_capsule *nc)
{
	const struct nvme_command *cmd = nvmf_capsule_sqe(nc);

	/* Only permit Fabrics commands while a controller is disabled. */
	if (NVMEV(NVME_CC_REG_EN, ctrlr->cc) == 0 &&
	    cmd->opc != NVME_OPC_FABRICS_COMMANDS) {
		nvmft_printf(ctrlr,
		    "Unsupported admin opcode %#x while disabled\n", cmd->opc);
		nvmft_send_generic_error(ctrlr->admin, nc,
		    NVME_SC_COMMAND_SEQUENCE_ERROR);
		nvmf_free_capsule(nc);
		return;
	}

	atomic_store_int(&ctrlr->ka_active_traffic, 1);

	switch (cmd->opc) {
	case NVME_OPC_GET_LOG_PAGE:
		handle_get_log_page(ctrlr, nc, cmd);
		break;
	case NVME_OPC_IDENTIFY:
		handle_identify_command(ctrlr, nc, cmd);
		break;
	case NVME_OPC_SET_FEATURES:
		handle_set_features(ctrlr, nc, cmd);
		break;
	case NVME_OPC_ASYNC_EVENT_REQUEST:
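		/*
		 * AER commands are not completed immediately; their
		 * CIDs are saved in a small ring buffer and consumed
		 * by nvmft_report_aer when an event fires.
		 */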
		mtx_lock(&ctrlr->lock);
		if (ctrlr->aer_pending == NVMFT_NUM_AER) {
			mtx_unlock(&ctrlr->lock);
			nvmft_send_error(ctrlr->admin, nc,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
		} else {
			/* NB: Store the CID without byte-swapping. */
			ctrlr->aer_cids[ctrlr->aer_pidx] = cmd->cid;
			ctrlr->aer_pending++;
			ctrlr->aer_pidx = (ctrlr->aer_pidx + 1) % NVMFT_NUM_AER;
			mtx_unlock(&ctrlr->lock);
		}
		nvmf_free_capsule(nc);
		break;
	case NVME_OPC_KEEP_ALIVE:
		nvmft_send_success(ctrlr->admin, nc);
		nvmf_free_capsule(nc);
		break;
	case NVME_OPC_FABRICS_COMMANDS:
		handle_admin_fabrics_command(ctrlr, nc,
		    (const struct nvmf_fabric_cmd *)cmd);
		break;
	default:
		nvmft_printf(ctrlr, "Unsupported admin opcode %#x\n", cmd->opc);
		nvmft_send_generic_error(ctrlr->admin, nc,
		    NVME_SC_INVALID_OPCODE);
		nvmf_free_capsule(nc);
		break;
	}
}

void
nvmft_handle_io_command(struct nvmft_qpair *qp, uint16_t qid,
    struct nvmf_capsule *nc)
{
	struct nvmft_controller *ctrlr = nvmft_qpair_ctrlr(qp);
	const struct nvme_command *cmd = nvmf_capsule_sqe(nc);

	atomic_store_int(&ctrlr->ka_active_traffic, 1);

	switch (cmd->opc) {
	case NVME_OPC_FLUSH:
		if (cmd->nsid == htole32(0xffffffff)) {
			nvmft_send_generic_error(qp, nc,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			nvmf_free_capsule(nc);
			break;
		}
		/* FALLTHROUGH */
	case NVME_OPC_WRITE:
	case NVME_OPC_READ:
	case NVME_OPC_WRITE_UNCORRECTABLE:
	case NVME_OPC_COMPARE:
	case NVME_OPC_WRITE_ZEROES:
	case NVME_OPC_DATASET_MANAGEMENT:
	case NVME_OPC_VERIFY:
		nvmft_dispatch_command(qp, nc, false);
		break;
	default:
		nvmft_printf(ctrlr, "Unsupported I/O opcode %#x\n", cmd->opc);
		nvmft_send_generic_error(qp, nc,
		    NVME_SC_INVALID_OPCODE);
		nvmf_free_capsule(nc);
		break;
	}
}

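/*
 * Complete a pending AER command with the given event type, info, and
 * log page ID.  Events that are masked off or that arrive with no AER
 * command outstanding are dropped.
 */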
static void
nvmft_report_aer(struct nvmft_controller *ctrlr, uint32_t aer_mask,
    u_int type, uint8_t info, uint8_t log_page_id)
{
	struct nvme_completion cpl;

	MPASS(type <= 7);

	/* Drop events that are not enabled. */
	mtx_lock(&ctrlr->lock);
	if ((ctrlr->aer_mask & aer_mask) == 0) {
		mtx_unlock(&ctrlr->lock);
		return;
	}

	/*
	 * If there is no pending AER command, drop it.
	 * XXX: Should we queue these?
	 */
	if (ctrlr->aer_pending == 0) {
		mtx_unlock(&ctrlr->lock);
		nvmft_printf(ctrlr,
		    "dropping AER type %u, info %#x, page %#x\n",
		    type, info, log_page_id);
		return;
	}

	memset(&cpl, 0, sizeof(cpl));
	cpl.cid = ctrlr->aer_cids[ctrlr->aer_cidx];
	ctrlr->aer_pending--;
	ctrlr->aer_cidx = (ctrlr->aer_cidx + 1) % NVMFT_NUM_AER;
	mtx_unlock(&ctrlr->lock);

	cpl.cdw0 = htole32(NVMEF(NVME_ASYNC_EVENT_TYPE, type) |
	    NVMEF(NVME_ASYNC_EVENT_INFO, info) |
	    NVMEF(NVME_ASYNC_EVENT_LOG_PAGE_ID, log_page_id));

	nvmft_send_response(ctrlr->admin, &cpl);
}

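/*
 * Record a namespace change in the Changed Namespace List log page,
 * keeping the list sorted by namespace ID, and report a Namespace
 * Attribute Changed AER unless one has already been reported since
 * the log was last read.
 */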
void
nvmft_controller_lun_changed(struct nvmft_controller *ctrlr, int lun_id)
{
	struct nvme_ns_list *nslist;
	uint32_t new_nsid, nsid;
	u_int i;

	new_nsid = lun_id + 1;

	mtx_lock(&ctrlr->lock);
	nslist = ctrlr->changed_ns;

	/* If the first entry is 0xffffffff, the list is already full. */
	if (nslist->ns[0] != 0xffffffff) {
		/* Find the insertion point for this namespace ID. */
		for (i = 0; i < nitems(nslist->ns); i++) {
			nsid = le32toh(nslist->ns[i]);
			if (nsid == new_nsid) {
				/* Already reported, nothing to do. */
				mtx_unlock(&ctrlr->lock);
				return;
			}

			if (nsid == 0 || nsid > new_nsid)
				break;
		}

		if (nslist->ns[nitems(nslist->ns) - 1] != htole32(0)) {
			/* List is full. */
			memset(ctrlr->changed_ns, 0,
			    sizeof(*ctrlr->changed_ns));
			ctrlr->changed_ns->ns[0] = 0xffffffff;
		} else if (nslist->ns[i] == htole32(0)) {
			/*
			 * Optimize case where this ID is appended to
			 * the end.
			 */
			nslist->ns[i] = htole32(new_nsid);
		} else {
			memmove(&nslist->ns[i + 1], &nslist->ns[i],
			    (nitems(nslist->ns) - i - 1) *
			    sizeof(nslist->ns[0]));
			nslist->ns[i] = htole32(new_nsid);
		}
	}

	if (ctrlr->changed_ns_reported) {
		mtx_unlock(&ctrlr->lock);
		return;
	}
	ctrlr->changed_ns_reported = true;
	mtx_unlock(&ctrlr->lock);

	nvmft_report_aer(ctrlr, NVME_ASYNC_EVENT_NS_ATTRIBUTE, 0x2, 0x0,
	    NVME_LOG_CHANGED_NAMESPACE);
}
1138