/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2023-2024 Chelsio Communications, Inc.
 * Written by: John Baldwin <jhb@FreeBSD.org>
 */

#include <sys/types.h>
#include <sys/bus.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/nv.h>
#include <sys/sysctl.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/nvmf.h>
#include <dev/nvmf/nvmf_transport.h>
#include <dev/nvmf/host/nvmf_var.h>

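/*
 * A command slot on a queue pair.  Each slot owns a CID; 'req' points
 * at the request currently using the slot, if any.
 */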
struct nvmf_host_command {
	struct nvmf_request *req;
	TAILQ_ENTRY(nvmf_host_command) link;
	uint16_t cid;
};

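/* Host-side state for a single admin or I/O queue pair. */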
struct nvmf_host_qpair {
	struct nvmf_softc *sc;
	struct nvmf_qpair *qp;

	bool	sq_flow_control;
	bool	shutting_down;
	u_int	allocating;
	u_int	num_commands;
	uint16_t sqhd;
	uint16_t sqtail;
	uint64_t submitted;

	struct mtx lock;

	TAILQ_HEAD(, nvmf_host_command) free_commands;
	STAILQ_HEAD(, nvmf_request) pending_requests;

	/* Indexed by cid. */
	struct nvmf_host_command **active_commands;

	char	name[16];
	struct sysctl_ctx_list sysctl_ctx;
};

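/*
 * Allocate a request wrapping a transport command capsule for the
 * given SQE.  'how' must be M_WAITOK or M_NOWAIT.  Returns NULL if
 * allocation fails or the queue pair has already been shut down.
 *
 * A typical caller pairs this with nvmf_submit_request(); a minimal
 * sketch (the command, callback, and argument names are placeholders):
 *
 *	req = nvmf_allocate_request(qp, &cmd, my_cb, my_arg, M_WAITOK);
 *	if (req != NULL)
 *		nvmf_submit_request(req);
 */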
struct nvmf_request *
nvmf_allocate_request(struct nvmf_host_qpair *qp, void *sqe,
    nvmf_request_complete_t *cb, void *cb_arg, int how)
{
	struct nvmf_request *req;
	struct nvmf_qpair *nq;

	KASSERT(how == M_WAITOK || how == M_NOWAIT,
	    ("%s: invalid how", __func__));

	req = malloc(sizeof(*req), M_NVMF, how | M_ZERO);
	if (req == NULL)
		return (NULL);

	mtx_lock(&qp->lock);
	nq = qp->qp;
	if (nq == NULL) {
		mtx_unlock(&qp->lock);
		free(req, M_NVMF);
		return (NULL);
	}
	qp->allocating++;
	MPASS(qp->allocating != 0);
	mtx_unlock(&qp->lock);

	req->qp = qp;
	req->cb = cb;
	req->cb_arg = cb_arg;
	req->nc = nvmf_allocate_command(nq, sqe, how);
	if (req->nc == NULL) {
		free(req, M_NVMF);
		req = NULL;
	}

	mtx_lock(&qp->lock);
	qp->allocating--;
	if (qp->allocating == 0 && qp->shutting_down)
		wakeup(qp);
	mtx_unlock(&qp->lock);

	return (req);
}

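/*
 * Invoke a request's completion callback with a synthesized
 * "Command Aborted By Host" status.
 */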
static void
nvmf_abort_request(struct nvmf_request *req, uint16_t cid)
{
	struct nvme_completion cqe;

	memset(&cqe, 0, sizeof(cqe));
	cqe.cid = cid;
	cqe.status = htole16(NVMEF(NVME_STATUS_SCT, NVME_SCT_PATH_RELATED) |
	    NVMEF(NVME_STATUS_SC, NVME_SC_COMMAND_ABORTED_BY_HOST));
	req->cb(req->cb_arg, &cqe);
}

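/* Free a request and its capsule, if it has one. */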
void
nvmf_free_request(struct nvmf_request *req)
{
	if (req->nc != NULL)
		nvmf_free_capsule(req->nc);
	free(req, M_NVMF);
}

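/*
 * Bind a command slot's CID to its request and transmit the capsule.
 * Called with the queue pair lock held; the lock is dropped before
 * the capsule is transmitted.
 */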
static void
nvmf_dispatch_command(struct nvmf_host_qpair *qp, struct nvmf_host_command *cmd)
{
	struct nvmf_softc *sc = qp->sc;
	struct nvme_command *sqe;
	struct nvmf_capsule *nc;
	uint16_t new_sqtail;
	int error;

	mtx_assert(&qp->lock, MA_OWNED);

	qp->submitted++;

	/*
	 * Update flow control tracking.  This is just a sanity check.
	 * Since num_commands == qsize - 1, there can never be too
	 * many commands in flight.
	 */
	new_sqtail = (qp->sqtail + 1) % (qp->num_commands + 1);
	KASSERT(new_sqtail != qp->sqhd, ("%s: qp %p is full", __func__, qp));
	qp->sqtail = new_sqtail;
	mtx_unlock(&qp->lock);

	nc = cmd->req->nc;
	sqe = nvmf_capsule_sqe(nc);

	/*
	 * NB: Don't bother byte-swapping the cid so that receive
	 * doesn't have to swap.
	 */
	sqe->cid = cmd->cid;

	error = nvmf_transmit_capsule(nc);
	if (error != 0) {
		device_printf(sc->dev,
		    "failed to transmit capsule: %d, disconnecting\n", error);
		nvmf_disconnect(sc);
		return;
	}

	if (sc->ka_traffic)
		atomic_store_int(&sc->ka_active_tx_traffic, 1);
}

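/*
 * Transport error callback.  Log the error (unless this is a normal
 * close during detach) and disconnect from the controller.
 */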
static void
nvmf_qp_error(void *arg, int error)
{
	struct nvmf_host_qpair *qp = arg;
	struct nvmf_softc *sc = qp->sc;

	/* Ignore simple close of queue pairs during shutdown. */
	if (!(sc->detaching && error == 0))
		device_printf(sc->dev, "error %d on %s, disconnecting\n", error,
		    qp->name);
	nvmf_disconnect(sc);
}

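/*
 * Handle a received response capsule: update submission queue
 * flow-control state, complete the matching request, and reuse the
 * command slot for the next pending request, if any.
 */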
static void
nvmf_receive_capsule(void *arg, struct nvmf_capsule *nc)
{
	struct nvmf_host_qpair *qp = arg;
	struct nvmf_softc *sc = qp->sc;
	struct nvmf_host_command *cmd;
	struct nvmf_request *req;
	const struct nvme_completion *cqe;
	uint16_t cid;

	cqe = nvmf_capsule_cqe(nc);

	if (sc->ka_traffic)
		atomic_store_int(&sc->ka_active_rx_traffic, 1);

	/*
	 * NB: Don't bother byte-swapping the cid as transmit doesn't
	 * swap either.
	 */
	cid = cqe->cid;

	if (cid >= qp->num_commands) {
		device_printf(sc->dev,
		    "received invalid CID %u, disconnecting\n", cid);
		nvmf_disconnect(sc);
		nvmf_free_capsule(nc);
		return;
	}

	/* Update flow control tracking. */
	mtx_lock(&qp->lock);
	if (qp->sq_flow_control) {
		if (nvmf_sqhd_valid(nc))
			qp->sqhd = le16toh(cqe->sqhd);
	} else {
		/*
		 * If SQ FC is disabled, just advance the head for
		 * each response capsule received.
		 */
		qp->sqhd = (qp->sqhd + 1) % (qp->num_commands + 1);
	}

	/*
	 * If the queue has been shut down due to an error, drop the
	 * response.
	 */
	if (qp->qp == NULL) {
		device_printf(sc->dev,
		    "received completion for CID %u on shutdown %s\n", cid,
		    qp->name);
		mtx_unlock(&qp->lock);
		nvmf_free_capsule(nc);
		return;
	}

	cmd = qp->active_commands[cid];
	if (cmd == NULL) {
		mtx_unlock(&qp->lock);
		device_printf(sc->dev,
		    "received completion for inactive CID %u, disconnecting\n",
		    cid);
		nvmf_disconnect(sc);
		nvmf_free_capsule(nc);
		return;
	}

	KASSERT(cmd->cid == cid, ("%s: CID mismatch", __func__));
	req = cmd->req;
	cmd->req = NULL;
	if (STAILQ_EMPTY(&qp->pending_requests)) {
		qp->active_commands[cid] = NULL;
		TAILQ_INSERT_TAIL(&qp->free_commands, cmd, link);
		mtx_unlock(&qp->lock);
	} else {
		cmd->req = STAILQ_FIRST(&qp->pending_requests);
		STAILQ_REMOVE_HEAD(&qp->pending_requests, link);
		nvmf_dispatch_command(qp, cmd);
	}

	req->cb(req->cb_arg, cqe);
	nvmf_free_capsule(nc);
	nvmf_free_request(req);
}

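/* Export per-queue statistics under the device's sysctl tree. */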
static void
nvmf_sysctls_qp(struct nvmf_softc *sc, struct nvmf_host_qpair *qp,
    bool admin, u_int qid)
{
	struct sysctl_ctx_list *ctx = &qp->sysctl_ctx;
	struct sysctl_oid *oid;
	struct sysctl_oid_list *list;
	char name[8];

	if (admin) {
		oid = SYSCTL_ADD_NODE(ctx,
		    SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)), OID_AUTO,
		    "adminq", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Admin Queue");
	} else {
		snprintf(name, sizeof(name), "%u", qid);
		oid = SYSCTL_ADD_NODE(ctx, sc->ioq_oid_list, OID_AUTO, name,
		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "I/O Queue");
	}
	list = SYSCTL_CHILDREN(oid);

	SYSCTL_ADD_UINT(ctx, list, OID_AUTO, "num_entries", CTLFLAG_RD,
	    NULL, qp->num_commands + 1, "Number of entries in queue");
	SYSCTL_ADD_U16(ctx, list, OID_AUTO, "sq_head", CTLFLAG_RD, &qp->sqhd,
	    0, "Current head of submission queue (as observed by driver)");
	SYSCTL_ADD_U16(ctx, list, OID_AUTO, "sq_tail", CTLFLAG_RD, &qp->sqtail,
	    0, "Current tail of submission queue (as observed by driver)");
	SYSCTL_ADD_U64(ctx, list, OID_AUTO, "num_cmds", CTLFLAG_RD,
	    &qp->submitted, 0, "Number of commands submitted");
}

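/*
 * Create a host queue pair from the transport parameters in 'nvl'
 * and populate its command slots.  Returns NULL if the transport
 * queue pair cannot be allocated.
 */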
struct nvmf_host_qpair *
nvmf_init_qp(struct nvmf_softc *sc, enum nvmf_trtype trtype,
    const nvlist_t *nvl, const char *name, u_int qid)
{
	struct nvmf_host_command *cmd, *ncmd;
	struct nvmf_host_qpair *qp;
	u_int i;
	bool admin;

	admin = nvlist_get_bool(nvl, "admin");
	qp = malloc(sizeof(*qp), M_NVMF, M_WAITOK | M_ZERO);
	qp->sc = sc;
	qp->sq_flow_control = nvlist_get_bool(nvl, "sq_flow_control");
	qp->sqhd = nvlist_get_number(nvl, "sqhd");
	qp->sqtail = nvlist_get_number(nvl, "sqtail");
	strlcpy(qp->name, name, sizeof(qp->name));
	mtx_init(&qp->lock, "nvmf qp", NULL, MTX_DEF);
	(void)sysctl_ctx_init(&qp->sysctl_ctx);

	/*
	 * Allocate a spare command slot for each pending AER command
	 * on the admin queue.
	 */
	qp->num_commands = nvlist_get_number(nvl, "qsize") - 1;
	if (admin)
		qp->num_commands += sc->num_aer;

	qp->active_commands = malloc(sizeof(*qp->active_commands) *
	    qp->num_commands, M_NVMF, M_WAITOK | M_ZERO);
	TAILQ_INIT(&qp->free_commands);
	for (i = 0; i < qp->num_commands; i++) {
		cmd = malloc(sizeof(*cmd), M_NVMF, M_WAITOK | M_ZERO);
		cmd->cid = i;
		TAILQ_INSERT_TAIL(&qp->free_commands, cmd, link);
	}
	STAILQ_INIT(&qp->pending_requests);

	qp->qp = nvmf_allocate_qpair(trtype, false, nvl, nvmf_qp_error, qp,
	    nvmf_receive_capsule, qp);
	if (qp->qp == NULL) {
		(void)sysctl_ctx_free(&qp->sysctl_ctx);
		TAILQ_FOREACH_SAFE(cmd, &qp->free_commands, link, ncmd) {
			TAILQ_REMOVE(&qp->free_commands, cmd, link);
			free(cmd, M_NVMF);
		}
		free(qp->active_commands, M_NVMF);
		mtx_destroy(&qp->lock);
		free(qp, M_NVMF);
		return (NULL);
	}

	nvmf_sysctls_qp(sc, qp, admin, qid);

	return (qp);
}

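/*
 * Tear down the transport queue pair and abort any active or pending
 * requests.  Safe to call more than once; later calls wait for an
 * in-progress shutdown to complete.
 */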
void
nvmf_shutdown_qp(struct nvmf_host_qpair *qp)
{
	struct nvmf_host_command *cmd;
	struct nvmf_request *req;
	struct nvmf_qpair *nq;

	mtx_lock(&qp->lock);
	nq = qp->qp;
	qp->qp = NULL;

	if (nq == NULL) {
		while (qp->shutting_down)
			mtx_sleep(qp, &qp->lock, 0, "nvmfqpsh", 0);
		mtx_unlock(&qp->lock);
		return;
	}
	qp->shutting_down = true;
	while (qp->allocating != 0)
		mtx_sleep(qp, &qp->lock, 0, "nvmfqpqu", 0);
	mtx_unlock(&qp->lock);

	nvmf_free_qpair(nq);

	/*
	 * Abort outstanding requests.  Active requests will have
	 * their I/O completions invoked and associated capsules freed
	 * by the transport layer via nvmf_free_qpair.  Pending
	 * requests must have their I/O completion invoked via
	 * nvmf_abort_capsule_data.
	 */
	for (u_int i = 0; i < qp->num_commands; i++) {
		cmd = qp->active_commands[i];
		if (cmd != NULL) {
			if (!cmd->req->aer)
				printf("%s: aborted active command %p (CID %u)\n",
				    __func__, cmd->req, cmd->cid);

			/* This was freed by nvmf_free_qpair. */
			cmd->req->nc = NULL;
			nvmf_abort_request(cmd->req, cmd->cid);
			nvmf_free_request(cmd->req);
			free(cmd, M_NVMF);
		}
	}
	while (!STAILQ_EMPTY(&qp->pending_requests)) {
		req = STAILQ_FIRST(&qp->pending_requests);
		STAILQ_REMOVE_HEAD(&qp->pending_requests, link);
		if (!req->aer)
			printf("%s: aborted pending command %p\n", __func__,
			    req);
		nvmf_abort_capsule_data(req->nc, ECONNABORTED);
		nvmf_abort_request(req, 0);
		nvmf_free_request(req);
	}

	mtx_lock(&qp->lock);
	qp->shutting_down = false;
	mtx_unlock(&qp->lock);
	wakeup(qp);
}

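/* Shut down a queue pair and release all of its resources. */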
void
nvmf_destroy_qp(struct nvmf_host_qpair *qp)
{
	struct nvmf_host_command *cmd, *ncmd;

	nvmf_shutdown_qp(qp);
	(void)sysctl_ctx_free(&qp->sysctl_ctx);

	TAILQ_FOREACH_SAFE(cmd, &qp->free_commands, link, ncmd) {
		TAILQ_REMOVE(&qp->free_commands, cmd, link);
		free(cmd, M_NVMF);
	}
	free(qp->active_commands, M_NVMF);
	mtx_destroy(&qp->lock);
	free(qp, M_NVMF);
}

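/*
 * Submit a request on its queue pair.  If no command slot is free,
 * the request is queued and dispatched once a slot is returned; if
 * the queue pair has been shut down, the request is aborted instead.
 */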
void
nvmf_submit_request(struct nvmf_request *req)
{
	struct nvmf_host_qpair *qp;
	struct nvmf_host_command *cmd;

	qp = req->qp;
	mtx_lock(&qp->lock);
	if (qp->qp == NULL) {
		mtx_unlock(&qp->lock);
		printf("%s: aborted pending command %p\n", __func__, req);
		nvmf_abort_capsule_data(req->nc, ECONNABORTED);
		nvmf_abort_request(req, 0);
		nvmf_free_request(req);
		return;
	}
	cmd = TAILQ_FIRST(&qp->free_commands);
	if (cmd == NULL) {
		/*
		 * Queue this request.  Will be sent after enough
		 * in-flight requests have completed.
		 */
		STAILQ_INSERT_TAIL(&qp->pending_requests, req, link);
		mtx_unlock(&qp->lock);
		return;
	}

	TAILQ_REMOVE(&qp->free_commands, cmd, link);
	KASSERT(qp->active_commands[cmd->cid] == NULL,
	    ("%s: CID already busy", __func__));
	qp->active_commands[cmd->cid] = cmd;
	cmd->req = req;
	nvmf_dispatch_command(qp, cmd);
}