xref: /freebsd/sys/dev/nvmf/nvmf_tcp.c (revision 35c0a8c449fd2b7f75029ebed5e10852240f0865)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2022-2024 Chelsio Communications, Inc.
5  * Written by: John Baldwin <jhb@FreeBSD.org>
6  */
7 
8 #include <sys/param.h>
9 #include <sys/capsicum.h>
10 #include <sys/condvar.h>
11 #include <sys/file.h>
12 #include <sys/gsb_crc32.h>
13 #include <sys/kernel.h>
14 #include <sys/kthread.h>
15 #include <sys/limits.h>
16 #include <sys/lock.h>
17 #include <sys/malloc.h>
18 #include <sys/mbuf.h>
19 #include <sys/module.h>
20 #include <sys/mutex.h>
21 #include <sys/nv.h>
22 #include <sys/protosw.h>
23 #include <sys/refcount.h>
24 #include <sys/socket.h>
25 #include <sys/socketvar.h>
26 #include <sys/sysctl.h>
27 #include <sys/uio.h>
28 #include <netinet/in.h>
29 #include <dev/nvme/nvme.h>
30 #include <dev/nvmf/nvmf.h>
31 #include <dev/nvmf/nvmf_proto.h>
32 #include <dev/nvmf/nvmf_tcp.h>
33 #include <dev/nvmf/nvmf_transport.h>
34 #include <dev/nvmf/nvmf_transport_internal.h>
35 
/*
 * Forward declarations.  Both structures are defined further down in
 * this file but are referenced by pointer before then.
 */
struct nvmf_tcp_capsule;
struct nvmf_tcp_qpair;
38 
/*
 * Tracks one data buffer associated with a command: the I/O request
 * describing the memory, how much of it has been transferred so far,
 * and the identifiers (CID and transfer tag) used to look it up.  The
 * buffer is reference counted; when the last reference is dropped the
 * I/O request is completed with data_xfered and error.
 */
struct nvmf_tcp_command_buffer {
	struct nvmf_tcp_qpair *qp;	/* Queue pair given at allocation */

	struct nvmf_io_request io;	/* Copy of the caller's I/O request */
	size_t	data_len;		/* Bytes covered by this buffer */
	size_t	data_xfered;		/* Bytes transferred so far */
	uint32_t data_offset;		/* Data offset of the first byte */

	u_int	refs;			/* Reference count */
	int	error;			/* Status reported on completion */

	uint16_t cid;			/* Command identifier */
	uint16_t ttag;			/* Transfer tag (0 until assigned) */

	TAILQ_ENTRY(nvmf_tcp_command_buffer) link;	/* List linkage */

	/* Controller only */
	struct nvmf_tcp_capsule *tc;	/* Released when the buffer is freed */
};
58 
/*
 * A list of command buffers together with the mutex protecting it.
 * The add/find/remove helpers in this file assert that the lock is
 * held by the caller.
 */
struct nvmf_tcp_command_buffer_list {
	TAILQ_HEAD(, nvmf_tcp_command_buffer) head;
	struct mtx lock;
};
63 
/*
 * Per-queue-pair state for the TCP transport: the socket, parameters
 * consulted when building and validating PDUs, receive and transmit
 * thread state, and the lists of outstanding command buffers.
 *
 * TQP() casts a struct nvmf_qpair pointer to this type, so qp must
 * remain the first member.
 */
struct nvmf_tcp_qpair {
	struct nvmf_qpair qp;

	struct socket *so;

	volatile u_int refs;	/* Every allocated capsule holds a reference */
	uint8_t	txpda;		/* Alignment applied to PDO when sending */
	uint8_t rxpda;		/* Passed to received header validation */
	bool header_digests;	/* Add HDGST to transmitted PDUs */
	bool data_digests;	/* Add DDGST to transmitted data PDUs */
	uint32_t maxr2t;	/* 0's based limit on R2Ts per capsule */
	uint32_t maxh2cdata;	/* Controller only */
	uint32_t max_tx_data;	/* Cap on payload of a sent H2C_DATA PDU */
	uint32_t max_icd;	/* Host only */
	uint16_t next_ttag;	/* Controller only */
	u_int num_ttags;	/* Controller only */
	u_int active_ttags;	/* Controller only */
	bool send_success;	/* Controller only */

	/* Receive state. */
	struct thread *rx_thread;
	struct cv rx_cv;
	bool	rx_shutdown;

	/* Transmit state. */
	struct thread *tx_thread;
	struct cv tx_cv;
	bool	tx_shutdown;
	struct mbufq tx_pdus;	/* PDUs queued by nvmf_tcp_write_pdu() */
	STAILQ_HEAD(, nvmf_tcp_capsule) tx_capsules;

	struct nvmf_tcp_command_buffer_list tx_buffers;
	struct nvmf_tcp_command_buffer_list rx_buffers;

	/*
	 * For the controller, an RX command buffer can be in one of
	 * two locations, all protected by the rx_buffers.lock.  If a
	 * receive request is waiting for either an R2T slot for its
	 * command (due to exceeding MAXR2T), or a transfer tag it is
	 * placed on the rx_buffers list.  When a request is allocated
	 * an active transfer tag, it moves to the open_ttags[] array
	 * (indexed by the tag) until it completes.
	 */
	struct nvmf_tcp_command_buffer **open_ttags;	/* Controller only */
};
109 
/*
 * A received PDU.  data_len and data_digest_mismatch are filled in by
 * nvmf_tcp_validate_pdu(); hdr is pointed at the start of the first
 * mbuf by nvmf_tcp_dispatch_pdu() once the header is contiguous.
 */
struct nvmf_tcp_rxpdu {
	struct mbuf *m;		/* mbuf chain holding the PDU */
	const struct nvme_tcp_common_pdu_hdr *hdr;	/* Header within m */
	uint32_t data_len;	/* Payload length from header validation */
	bool data_digest_mismatch;	/* DDGST present but did not match */
};
116 
/*
 * TCP transport wrapper around a generic capsule.  TCAP() casts a
 * struct nvmf_capsule pointer to this type, so nc must remain the
 * first member.
 */
struct nvmf_tcp_capsule {
	struct nvmf_capsule nc;

	volatile u_int refs;

	struct nvmf_tcp_rxpdu rx_pdu;	/* PDU the capsule was received in */

	/* Count of transfer tags currently assigned to this capsule. */
	uint32_t active_r2ts;		/* Controller only */
#ifdef INVARIANTS
	uint32_t tx_data_offset;	/* Controller only */
	u_int pending_r2ts;		/* Controller only */
#endif

	STAILQ_ENTRY(nvmf_tcp_capsule) link;
};
132 
/* Cast generic capsule / queue pair pointers to the TCP-specific types. */
#define	TCAP(nc)	((struct nvmf_tcp_capsule *)(nc))
#define	TQP(qp)		((struct nvmf_tcp_qpair *)(qp))

/*
 * Forward declarations.  tcp_release_capsule() is called by
 * tcp_free_command_buffer() below.
 */
static void	tcp_release_capsule(struct nvmf_tcp_capsule *tc);
static void	tcp_free_qpair(struct nvmf_qpair *nq);
138 
/* kern.nvmf.tcp sysctl node and its tunables. */
SYSCTL_NODE(_kern_nvmf, OID_AUTO, tcp, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "TCP transport");
/* Default is 256 KiB; exported read-write as a tunable. */
static u_int tcp_max_transmit_data = 256 * 1024;
SYSCTL_UINT(_kern_nvmf_tcp, OID_AUTO, max_transmit_data, CTLFLAG_RWTUN,
    &tcp_max_transmit_data, 0,
    "Maximum size of data payload in a transmitted PDU");

/* malloc(9) type used for allocations made by this file. */
static MALLOC_DEFINE(M_NVMF_TCP, "nvmf_tcp", "NVMe over TCP");
147 
148 static int
149 mbuf_crc32c_helper(void *arg, void *data, u_int len)
150 {
151 	uint32_t *digestp = arg;
152 
153 	*digestp = calculate_crc32c(*digestp, data, len);
154 	return (0);
155 }
156 
157 static uint32_t
158 mbuf_crc32c(struct mbuf *m, u_int offset, u_int len)
159 {
160 	uint32_t digest = 0xffffffff;
161 
162 	m_apply(m, offset, len, mbuf_crc32c_helper, &digest);
163 	digest = digest ^ 0xffffffff;
164 
165 	return (digest);
166 }
167 
/*
 * Compute the CRC32C digest of a contiguous buffer, using the same
 * all-ones seed and final inversion as mbuf_crc32c().
 */
static uint32_t
compute_digest(const void *buf, size_t len)
{
	uint32_t crc;

	crc = calculate_crc32c(0xffffffff, buf, len);
	return (crc ^ 0xffffffff);
}
173 
174 static struct nvmf_tcp_command_buffer *
175 tcp_alloc_command_buffer(struct nvmf_tcp_qpair *qp,
176     const struct nvmf_io_request *io, uint32_t data_offset, size_t data_len,
177     uint16_t cid)
178 {
179 	struct nvmf_tcp_command_buffer *cb;
180 
181 	cb = malloc(sizeof(*cb), M_NVMF_TCP, M_WAITOK);
182 	cb->qp = qp;
183 	cb->io = *io;
184 	cb->data_offset = data_offset;
185 	cb->data_len = data_len;
186 	cb->data_xfered = 0;
187 	refcount_init(&cb->refs, 1);
188 	cb->error = 0;
189 	cb->cid = cid;
190 	cb->ttag = 0;
191 	cb->tc = NULL;
192 
193 	return (cb);
194 }
195 
/* Acquire an additional reference on a command buffer. */
static void
tcp_hold_command_buffer(struct nvmf_tcp_command_buffer *cb)
{
	refcount_acquire(&cb->refs);
}
201 
202 static void
203 tcp_free_command_buffer(struct nvmf_tcp_command_buffer *cb)
204 {
205 	nvmf_complete_io_request(&cb->io, cb->data_xfered, cb->error);
206 	if (cb->tc != NULL)
207 		tcp_release_capsule(cb->tc);
208 	free(cb, M_NVMF_TCP);
209 }
210 
211 static void
212 tcp_release_command_buffer(struct nvmf_tcp_command_buffer *cb)
213 {
214 	if (refcount_release(&cb->refs))
215 		tcp_free_command_buffer(cb);
216 }
217 
/*
 * Insert a command buffer at the head of a list.  The caller must
 * hold the list lock.
 */
static void
tcp_add_command_buffer(struct nvmf_tcp_command_buffer_list *list,
    struct nvmf_tcp_command_buffer *cb)
{
	mtx_assert(&list->lock, MA_OWNED);
	TAILQ_INSERT_HEAD(&list->head, cb, link);
}
225 
226 static struct nvmf_tcp_command_buffer *
227 tcp_find_command_buffer(struct nvmf_tcp_command_buffer_list *list,
228     uint16_t cid, uint16_t ttag)
229 {
230 	struct nvmf_tcp_command_buffer *cb;
231 
232 	mtx_assert(&list->lock, MA_OWNED);
233 	TAILQ_FOREACH(cb, &list->head, link) {
234 		if (cb->cid == cid && cb->ttag == ttag)
235 			return (cb);
236 	}
237 	return (NULL);
238 }
239 
/*
 * Unlink a command buffer from a list.  The caller must hold the list
 * lock; no reference is dropped here.
 */
static void
tcp_remove_command_buffer(struct nvmf_tcp_command_buffer_list *list,
    struct nvmf_tcp_command_buffer *cb)
{
	mtx_assert(&list->lock, MA_OWNED);
	TAILQ_REMOVE(&list->head, cb, link);
}
247 
248 static void
249 tcp_purge_command_buffer(struct nvmf_tcp_command_buffer_list *list,
250     uint16_t cid, uint16_t ttag)
251 {
252 	struct nvmf_tcp_command_buffer *cb;
253 
254 	mtx_lock(&list->lock);
255 	cb = tcp_find_command_buffer(list, cid, ttag);
256 	if (cb != NULL) {
257 		tcp_remove_command_buffer(list, cb);
258 		mtx_unlock(&list->lock);
259 		tcp_release_command_buffer(cb);
260 	} else
261 		mtx_unlock(&list->lock);
262 }
263 
264 static void
265 nvmf_tcp_write_pdu(struct nvmf_tcp_qpair *qp, struct mbuf *m)
266 {
267 	struct socket *so = qp->so;
268 
269 	SOCKBUF_LOCK(&so->so_snd);
270 	mbufq_enqueue(&qp->tx_pdus, m);
271 	/* XXX: Do we need to handle sb_hiwat being wrong? */
272 	if (sowriteable(so))
273 		cv_signal(&qp->tx_cv);
274 	SOCKBUF_UNLOCK(&so->so_snd);
275 }
276 
277 static void
278 nvmf_tcp_report_error(struct nvmf_tcp_qpair *qp, uint16_t fes, uint32_t fei,
279     struct mbuf *rx_pdu, u_int hlen)
280 {
281 	struct nvme_tcp_term_req_hdr *hdr;
282 	struct mbuf *m;
283 
284 	if (hlen != 0) {
285 		hlen = min(hlen, NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE);
286 		hlen = min(hlen, m_length(rx_pdu, NULL));
287 	}
288 
289 	m = m_get2(sizeof(*hdr) + hlen, M_WAITOK, MT_DATA, 0);
290 	m->m_len = sizeof(*hdr) + hlen;
291 	hdr = mtod(m, void *);
292 	memset(hdr, 0, sizeof(*hdr));
293 	hdr->common.pdu_type = qp->qp.nq_controller ?
294 	    NVME_TCP_PDU_TYPE_C2H_TERM_REQ : NVME_TCP_PDU_TYPE_H2C_TERM_REQ;
295 	hdr->common.hlen = sizeof(*hdr);
296 	hdr->common.plen = sizeof(*hdr) + hlen;
297 	hdr->fes = htole16(fes);
298 	le32enc(hdr->fei, fei);
299 	if (hlen != 0)
300 		m_copydata(rx_pdu, 0, hlen, (caddr_t)(hdr + 1));
301 
302 	nvmf_tcp_write_pdu(qp, m);
303 }
304 
/*
 * Validate a received PDU.
 *
 * The common header is checked by nvmf_tcp_validate_pdu_header() and
 * then the header and data digests are verified if the header flags
 * say they are present.  On success, pdu->data_len and
 * pdu->data_digest_mismatch are filled in and 0 is returned.
 *
 * A header validation failure returns the error from
 * nvmf_tcp_validate_pdu_header(); unless that error is ECONNRESET, a
 * termination request is queued first.  A header digest mismatch
 * queues a termination request and returns EBADMSG.  A data digest
 * mismatch is not treated as an error here: it is only recorded in
 * the PDU for the caller to act on.
 */
static int
nvmf_tcp_validate_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
{
	const struct nvme_tcp_common_pdu_hdr *ch;
	struct mbuf *m = pdu->m;
	uint32_t data_len, fei, plen;
	uint32_t digest, rx_digest;
	u_int hlen;
	int error;
	uint16_t fes;

	/* Determine how large of a PDU header to return for errors. */
	ch = pdu->hdr;
	hlen = ch->hlen;
	plen = le32toh(ch->plen);
	/* Fall back to just the common header if the lengths are bogus. */
	if (hlen < sizeof(*ch) || hlen > plen)
		hlen = sizeof(*ch);

	error = nvmf_tcp_validate_pdu_header(ch, qp->qp.nq_controller,
	    qp->header_digests, qp->data_digests, qp->rxpda, &data_len, &fes,
	    &fei);
	if (error != 0) {
		if (error != ECONNRESET)
			nvmf_tcp_report_error(qp, fes, fei, m, hlen);
		return (error);
	}

	/* Check header digest if present. */
	if ((ch->flags & NVME_TCP_CH_FLAGS_HDGSTF) != 0) {
		/* The digest covers the header and is stored right after it. */
		digest = mbuf_crc32c(m, 0, ch->hlen);
		m_copydata(m, ch->hlen, sizeof(rx_digest), (caddr_t)&rx_digest);
		if (digest != rx_digest) {
			printf("NVMe/TCP: Header digest mismatch\n");
			nvmf_tcp_report_error(qp,
			    NVME_TCP_TERM_REQ_FES_HDGST_ERROR, rx_digest, m,
			    hlen);
			return (EBADMSG);
		}
	}

	/* Check data digest if present. */
	pdu->data_digest_mismatch = false;
	if ((ch->flags & NVME_TCP_CH_FLAGS_DDGSTF) != 0) {
		/* The digest covers the data and occupies the PDU's tail. */
		digest = mbuf_crc32c(m, ch->pdo, data_len);
		m_copydata(m, plen - sizeof(rx_digest), sizeof(rx_digest),
		    (caddr_t)&rx_digest);
		if (digest != rx_digest) {
			printf("NVMe/TCP: Data digest mismatch\n");
			pdu->data_digest_mismatch = true;
		}
	}

	pdu->data_len = data_len;
	return (0);
}
360 
361 static void
362 nvmf_tcp_free_pdu(struct nvmf_tcp_rxpdu *pdu)
363 {
364 	m_freem(pdu->m);
365 	pdu->m = NULL;
366 	pdu->hdr = NULL;
367 }
368 
369 static int
370 nvmf_tcp_handle_term_req(struct nvmf_tcp_rxpdu *pdu)
371 {
372 	const struct nvme_tcp_term_req_hdr *hdr;
373 
374 	hdr = (const void *)pdu->hdr;
375 
376 	printf("NVMe/TCP: Received termination request: fes %#x fei %#x\n",
377 	    le16toh(hdr->fes), le32dec(hdr->fei));
378 	nvmf_tcp_free_pdu(pdu);
379 	return (ECONNRESET);
380 }
381 
382 static int
383 nvmf_tcp_save_command_capsule(struct nvmf_tcp_qpair *qp,
384     struct nvmf_tcp_rxpdu *pdu)
385 {
386 	const struct nvme_tcp_cmd *cmd;
387 	struct nvmf_capsule *nc;
388 	struct nvmf_tcp_capsule *tc;
389 
390 	cmd = (const void *)pdu->hdr;
391 
392 	nc = nvmf_allocate_command(&qp->qp, &cmd->ccsqe, M_WAITOK);
393 
394 	tc = TCAP(nc);
395 	tc->rx_pdu = *pdu;
396 
397 	nvmf_capsule_received(&qp->qp, nc);
398 	return (0);
399 }
400 
401 static int
402 nvmf_tcp_save_response_capsule(struct nvmf_tcp_qpair *qp,
403     struct nvmf_tcp_rxpdu *pdu)
404 {
405 	const struct nvme_tcp_rsp *rsp;
406 	struct nvmf_capsule *nc;
407 	struct nvmf_tcp_capsule *tc;
408 
409 	rsp = (const void *)pdu->hdr;
410 
411 	nc = nvmf_allocate_response(&qp->qp, &rsp->rccqe, M_WAITOK);
412 
413 	nc->nc_sqhd_valid = true;
414 	tc = TCAP(nc);
415 	tc->rx_pdu = *pdu;
416 
417 	/*
418 	 * Once the CQE has been received, no further transfers to the
419 	 * command buffer for the associated CID can occur.
420 	 */
421 	tcp_purge_command_buffer(&qp->rx_buffers, rsp->rccqe.cid, 0);
422 	tcp_purge_command_buffer(&qp->tx_buffers, rsp->rccqe.cid, 0);
423 
424 	nvmf_capsule_received(&qp->qp, nc);
425 	return (0);
426 }
427 
/*
 * Construct a PDU that contains an optional data payload.  This
 * includes dealing with digests and the length fields in the common
 * header.
 *
 * hdr/hlen describe the PDU-specific header, which is copied into a
 * newly allocated leading mbuf.  The common header's hlen, pdo, plen
 * and digest flags are filled in here.  If data_len is non-zero, data
 * must be an mbuf chain of exactly data_len bytes; it is linked after
 * the header mbuf (not copied) and, when data digests are enabled, an
 * mbuf holding the digest is appended to it.  If data_len is zero,
 * data must be NULL.
 *
 * Returns the head of the complete PDU chain.
 */
static struct mbuf *
nvmf_tcp_construct_pdu(struct nvmf_tcp_qpair *qp, void *hdr, size_t hlen,
    struct mbuf *data, uint32_t data_len)
{
	struct nvme_tcp_common_pdu_hdr *ch;
	struct mbuf *top;
	uint32_t digest, pad, pdo, plen, mlen;

	/*
	 * Work out the layout: plen is the total PDU length, pdo the
	 * offset of the data, pad the padding before the data, and
	 * mlen the size of the leading mbuf (header, HDGST and PAD).
	 */
	plen = hlen;
	if (qp->header_digests)
		plen += sizeof(digest);
	if (data_len != 0) {
		KASSERT(m_length(data, NULL) == data_len, ("length mismatch"));
		/* Data starts at the next multiple of txpda. */
		pdo = roundup(plen, qp->txpda);
		pad = pdo - plen;
		plen = pdo + data_len;
		if (qp->data_digests)
			plen += sizeof(digest);
		mlen = pdo;
	} else {
		KASSERT(data == NULL, ("payload mbuf with zero length"));
		pdo = 0;
		pad = 0;
		mlen = plen;
	}

	top = m_get2(mlen, M_WAITOK, MT_DATA, 0);
	top->m_len = mlen;
	ch = mtod(top, void *);
	memcpy(ch, hdr, hlen);
	ch->hlen = hlen;
	/* Digest flags are OR'd into whatever flags the caller set. */
	if (qp->header_digests)
		ch->flags |= NVME_TCP_CH_FLAGS_HDGSTF;
	if (qp->data_digests && data_len != 0)
		ch->flags |= NVME_TCP_CH_FLAGS_DDGSTF;
	ch->pdo = pdo;
	ch->plen = htole32(plen);

	/* HDGST */
	if (qp->header_digests) {
		/* Computed after the header is final, stored right after it. */
		digest = compute_digest(ch, hlen);
		memcpy((char *)ch + hlen, &digest, sizeof(digest));
	}

	if (pad != 0) {
		/* PAD */
		memset((char *)ch + pdo - pad, 0, pad);
	}

	if (data_len != 0) {
		/* DATA */
		top->m_next = data;

		/* DDGST */
		if (qp->data_digests) {
			digest = mbuf_crc32c(data, 0, data_len);

			/* XXX: Can't use m_append as it uses M_NOWAIT. */
			while (data->m_next != NULL)
				data = data->m_next;

			/* data now points at the last mbuf of the payload. */
			data->m_next = m_get(M_WAITOK, MT_DATA);
			data->m_next->m_len = sizeof(digest);
			memcpy(mtod(data->m_next, void *), &digest,
			    sizeof(digest));
		}
	}

	return (top);
}
503 
504 /* Find the next command buffer eligible to schedule for R2T. */
505 static struct nvmf_tcp_command_buffer *
506 nvmf_tcp_next_r2t(struct nvmf_tcp_qpair *qp)
507 {
508 	struct nvmf_tcp_command_buffer *cb;
509 
510 	mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
511 	MPASS(qp->active_ttags < qp->num_ttags);
512 
513 	TAILQ_FOREACH(cb, &qp->rx_buffers.head, link) {
514 		/* NB: maxr2t is 0's based. */
515 		if (cb->tc->active_r2ts > qp->maxr2t)
516 			continue;
517 #ifdef INVARIANTS
518 		cb->tc->pending_r2ts--;
519 #endif
520 		TAILQ_REMOVE(&qp->rx_buffers.head, cb, link);
521 		return (cb);
522 	}
523 	return (NULL);
524 }
525 
526 /* Allocate the next free transfer tag and assign it to cb. */
527 static void
528 nvmf_tcp_allocate_ttag(struct nvmf_tcp_qpair *qp,
529     struct nvmf_tcp_command_buffer *cb)
530 {
531 	uint16_t ttag;
532 
533 	mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
534 
535 	ttag = qp->next_ttag;
536 	for (;;) {
537 		if (qp->open_ttags[ttag] == NULL)
538 			break;
539 		if (ttag == qp->num_ttags - 1)
540 			ttag = 0;
541 		else
542 			ttag++;
543 		MPASS(ttag != qp->next_ttag);
544 	}
545 	if (ttag == qp->num_ttags - 1)
546 		qp->next_ttag = 0;
547 	else
548 		qp->next_ttag = ttag + 1;
549 
550 	cb->tc->active_r2ts++;
551 	qp->active_ttags++;
552 	qp->open_ttags[ttag] = cb;
553 
554 	/*
555 	 * Don't bother byte-swapping ttag as it is just a cookie
556 	 * value returned by the other end as-is.
557 	 */
558 	cb->ttag = ttag;
559 }
560 
561 /* NB: cid and ttag are both little-endian already. */
562 static void
563 tcp_send_r2t(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
564     uint32_t data_offset, uint32_t data_len)
565 {
566 	struct nvme_tcp_r2t_hdr r2t;
567 	struct mbuf *m;
568 
569 	memset(&r2t, 0, sizeof(r2t));
570 	r2t.common.pdu_type = NVME_TCP_PDU_TYPE_R2T;
571 	r2t.cccid = cid;
572 	r2t.ttag = ttag;
573 	r2t.r2to = htole32(data_offset);
574 	r2t.r2tl = htole32(data_len);
575 
576 	m = nvmf_tcp_construct_pdu(qp, &r2t, sizeof(r2t), NULL, 0);
577 	nvmf_tcp_write_pdu(qp, m);
578 }
579 
580 /*
581  * Release a transfer tag and schedule another R2T.
582  *
583  * NB: This drops the rx_buffers.lock mutex.
584  */
585 static void
586 nvmf_tcp_send_next_r2t(struct nvmf_tcp_qpair *qp,
587     struct nvmf_tcp_command_buffer *cb)
588 {
589 	struct nvmf_tcp_command_buffer *ncb;
590 
591 	mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
592 	MPASS(qp->open_ttags[cb->ttag] == cb);
593 
594 	/* Release this transfer tag. */
595 	qp->open_ttags[cb->ttag] = NULL;
596 	qp->active_ttags--;
597 	cb->tc->active_r2ts--;
598 
599 	/* Schedule another R2T. */
600 	ncb = nvmf_tcp_next_r2t(qp);
601 	if (ncb != NULL) {
602 		nvmf_tcp_allocate_ttag(qp, ncb);
603 		mtx_unlock(&qp->rx_buffers.lock);
604 		tcp_send_r2t(qp, ncb->cid, ncb->ttag, ncb->data_offset,
605 		    ncb->data_len);
606 	} else
607 		mtx_unlock(&qp->rx_buffers.lock);
608 }
609 
610 /*
611  * Copy len bytes starting at offset skip from an mbuf chain into an
612  * I/O buffer at destination offset io_offset.
613  */
614 static void
615 mbuf_copyto_io(struct mbuf *m, u_int skip, u_int len,
616     struct nvmf_io_request *io, u_int io_offset)
617 {
618 	u_int todo;
619 
620 	while (m->m_len <= skip) {
621 		skip -= m->m_len;
622 		m = m->m_next;
623 	}
624 	while (len != 0) {
625 		MPASS((m->m_flags & M_EXTPG) == 0);
626 
627 		todo = min(m->m_len - skip, len);
628 		memdesc_copyback(&io->io_mem, io_offset, todo, mtodo(m, skip));
629 		skip = 0;
630 		io_offset += todo;
631 		len -= todo;
632 		m = m->m_next;
633 	}
634 }
635 
/*
 * Handle a received H2C_DATA PDU.
 *
 * The PDU's transfer tag selects a command buffer from open_ttags[].
 * The PDU is then checked against the buffer: its data length must
 * match the validated payload length, its range must fall inside the
 * buffer, it must continue exactly where the previous PDU ended, and
 * the LAST_PDU flag must be set if and only if it completes the
 * buffer.  Any violation queues a termination request and returns
 * EBADMSG.  Otherwise the payload is copied into the I/O buffer and 0
 * is returned.  A data digest mismatch fails the I/O request with
 * EINTEGRITY but also returns 0.
 *
 * The PDU is freed on every path.
 */
static int
nvmf_tcp_handle_h2c_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
{
	const struct nvme_tcp_h2c_data_hdr *h2c;
	struct nvmf_tcp_command_buffer *cb;
	uint32_t data_len, data_offset;
	uint16_t ttag;

	h2c = (const void *)pdu->hdr;
	/* Reject PDUs carrying more data than maxh2cdata allows. */
	if (le32toh(h2c->datal) > qp->maxh2cdata) {
		nvmf_tcp_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_LIMIT_EXCEEDED, 0,
		    pdu->m, pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	/*
	 * NB: Don't bother byte-swapping ttag as we don't byte-swap
	 * it when sending.
	 */
	ttag = h2c->ttag;
	/* Bounds-check the tag before using it to index open_ttags[]. */
	if (ttag >= qp->num_ttags) {
		nvmf_tcp_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
		    offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->m,
		    pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	mtx_lock(&qp->rx_buffers.lock);
	cb = qp->open_ttags[ttag];
	/* The tag is in range but no transfer is open for it. */
	if (cb == NULL) {
		mtx_unlock(&qp->rx_buffers.lock);
		nvmf_tcp_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
		    offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->m,
		    pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}
	MPASS(cb->ttag == ttag);

	/* For a data digest mismatch, fail the I/O request. */
	if (pdu->data_digest_mismatch) {
		/* NB: nvmf_tcp_send_next_r2t() drops rx_buffers.lock. */
		nvmf_tcp_send_next_r2t(qp, cb);
		cb->error = EINTEGRITY;
		tcp_release_command_buffer(cb);
		nvmf_tcp_free_pdu(pdu);
		return (0);
	}

	/* The header's length must agree with the validated payload length. */
	data_len = le32toh(h2c->datal);
	if (data_len != pdu->data_len) {
		mtx_unlock(&qp->rx_buffers.lock);
		nvmf_tcp_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
		    offsetof(struct nvme_tcp_h2c_data_hdr, datal), pdu->m,
		    pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	/* The PDU's range must lie within the command buffer. */
	data_offset = le32toh(h2c->datao);
	if (data_offset < cb->data_offset ||
	    data_offset + data_len > cb->data_offset + cb->data_len) {
		mtx_unlock(&qp->rx_buffers.lock);
		nvmf_tcp_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, pdu->m,
		    pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	/* Data must arrive in order, with no gaps or repeats. */
	if (data_offset != cb->data_offset + cb->data_xfered) {
		mtx_unlock(&qp->rx_buffers.lock);
		nvmf_tcp_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
		    pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	/* LAST_PDU must be set exactly when this PDU completes the buffer. */
	if ((cb->data_xfered + data_len == cb->data_len) !=
	    ((pdu->hdr->flags & NVME_TCP_H2C_DATA_FLAGS_LAST_PDU) != 0)) {
		mtx_unlock(&qp->rx_buffers.lock);
		nvmf_tcp_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
		    pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	cb->data_xfered += data_len;
	/* Convert to an offset relative to the start of the I/O buffer. */
	data_offset -= cb->data_offset;
	if (cb->data_xfered == cb->data_len) {
		/*
		 * Transfer complete: give up the tag (this also drops
		 * the lock).  No extra reference is taken here, so the
		 * release below drops one that already existed.
		 */
		nvmf_tcp_send_next_r2t(qp, cb);
	} else {
		/*
		 * More data is expected: take a reference to balance
		 * the release below.
		 */
		tcp_hold_command_buffer(cb);
		mtx_unlock(&qp->rx_buffers.lock);
	}

	mbuf_copyto_io(pdu->m, pdu->hdr->pdo, data_len, &cb->io, data_offset);

	tcp_release_command_buffer(cb);
	nvmf_tcp_free_pdu(pdu);
	return (0);
}
745 
/*
 * Handle a received C2H_DATA PDU.
 *
 * The command buffer is looked up on rx_buffers by the PDU's CCCID.
 * The PDU is then checked against the buffer: its data length must
 * match the validated payload length, its range must fall inside the
 * buffer, it must continue exactly where the previous PDU ended, and
 * the LAST_PDU flag must be set if and only if it completes the
 * buffer.  Any violation queues a termination request and returns
 * EBADMSG.  Otherwise the payload is copied into the I/O buffer and 0
 * is returned.  A data digest mismatch fails the I/O request with
 * EINTEGRITY but also returns 0.
 *
 * If the PDU has the SUCCESS flag set, a response capsule with a
 * zeroed completion carrying only the CID is synthesized and
 * delivered.
 *
 * The PDU is freed on every path.
 */
static int
nvmf_tcp_handle_c2h_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
{
	const struct nvme_tcp_c2h_data_hdr *c2h;
	struct nvmf_tcp_command_buffer *cb;
	uint32_t data_len, data_offset;

	c2h = (const void *)pdu->hdr;

	mtx_lock(&qp->rx_buffers.lock);
	cb = tcp_find_command_buffer(&qp->rx_buffers, c2h->cccid, 0);
	if (cb == NULL) {
		mtx_unlock(&qp->rx_buffers.lock);
		/*
		 * XXX: Could be PDU sequence error if cccid is for a
		 * command that doesn't use a command buffer.
		 */
		nvmf_tcp_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
		    offsetof(struct nvme_tcp_c2h_data_hdr, cccid), pdu->m,
		    pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	/* For a data digest mismatch, fail the I/O request. */
	if (pdu->data_digest_mismatch) {
		cb->error = EINTEGRITY;
		/* Unlink the buffer, then drop a reference on it. */
		tcp_remove_command_buffer(&qp->rx_buffers, cb);
		mtx_unlock(&qp->rx_buffers.lock);
		tcp_release_command_buffer(cb);
		nvmf_tcp_free_pdu(pdu);
		return (0);
	}

	/* The header's length must agree with the validated payload length. */
	data_len = le32toh(c2h->datal);
	if (data_len != pdu->data_len) {
		mtx_unlock(&qp->rx_buffers.lock);
		nvmf_tcp_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
		    offsetof(struct nvme_tcp_c2h_data_hdr, datal), pdu->m,
		    pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	/* The PDU's range must lie within the command buffer. */
	data_offset = le32toh(c2h->datao);
	if (data_offset < cb->data_offset ||
	    data_offset + data_len > cb->data_offset + cb->data_len) {
		mtx_unlock(&qp->rx_buffers.lock);
		nvmf_tcp_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
		    pdu->m, pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	/* Data must arrive in order, with no gaps or repeats. */
	if (data_offset != cb->data_offset + cb->data_xfered) {
		mtx_unlock(&qp->rx_buffers.lock);
		nvmf_tcp_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
		    pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	/* LAST_PDU must be set exactly when this PDU completes the buffer. */
	if ((cb->data_xfered + data_len == cb->data_len) !=
	    ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_LAST_PDU) != 0)) {
		mtx_unlock(&qp->rx_buffers.lock);
		nvmf_tcp_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
		    pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	cb->data_xfered += data_len;
	/* Convert to an offset relative to the start of the I/O buffer. */
	data_offset -= cb->data_offset;
	/*
	 * When the transfer is complete the buffer is unlinked and the
	 * release below drops a reference that already existed.
	 * Otherwise a reference is taken to balance that release.
	 */
	if (cb->data_xfered == cb->data_len)
		tcp_remove_command_buffer(&qp->rx_buffers, cb);
	else
		tcp_hold_command_buffer(cb);
	mtx_unlock(&qp->rx_buffers.lock);

	mbuf_copyto_io(pdu->m, pdu->hdr->pdo, data_len, &cb->io, data_offset);

	tcp_release_command_buffer(cb);

	if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) {
		struct nvme_completion cqe;
		struct nvmf_capsule *nc;

		/* Synthesize a completion that carries only the CID. */
		memset(&cqe, 0, sizeof(cqe));
		cqe.cid = c2h->cccid;

		nc = nvmf_allocate_response(&qp->qp, &cqe, M_WAITOK);
		nc->nc_sqhd_valid = false;

		nvmf_capsule_received(&qp->qp, nc);
	}

	nvmf_tcp_free_pdu(pdu);
	return (0);
}
850 
/*
 * Called when m_free drops refcount to 0.
 *
 * nvmf_tcp_mbuf() registers the command buffer's own refs field as
 * the external reference count and the buffer itself as ext_arg1, so
 * the buffer is freed directly here rather than released.
 */
static void
nvmf_tcp_mbuf_done(struct mbuf *m)
{
	struct nvmf_tcp_command_buffer *cb = m->m_ext.ext_arg1;

	tcp_free_command_buffer(cb);
}
859 
860 static struct mbuf *
861 nvmf_tcp_mbuf(void *arg, int how, void *data, size_t len)
862 {
863 	struct nvmf_tcp_command_buffer *cb = arg;
864 	struct mbuf *m;
865 
866 	m = m_get(how, MT_DATA);
867 	m->m_flags |= M_RDONLY;
868 	m_extaddref(m, data, len, &cb->refs, nvmf_tcp_mbuf_done, cb, NULL);
869 	m->m_len = len;
870 	return (m);
871 }
872 
/*
 * Free routine for the unmapped mbufs created by nvmf_tcp_mext_pg():
 * drop the command buffer reference taken when the mbuf was
 * allocated.
 */
static void
nvmf_tcp_free_mext_pg(struct mbuf *m)
{
	struct nvmf_tcp_command_buffer *cb = m->m_ext.ext_arg1;

	M_ASSERTEXTPG(m);
	tcp_release_command_buffer(cb);
}
881 
882 static struct mbuf *
883 nvmf_tcp_mext_pg(void *arg, int how)
884 {
885 	struct nvmf_tcp_command_buffer *cb = arg;
886 	struct mbuf *m;
887 
888 	m = mb_alloc_ext_pgs(how, nvmf_tcp_free_mext_pg, M_RDONLY);
889 	m->m_ext.ext_arg1 = cb;
890 	tcp_hold_command_buffer(cb);
891 	return (m);
892 }
893 
894 /*
895  * Return an mbuf chain for a range of data belonging to a command
896  * buffer.
897  *
898  * The mbuf chain uses M_EXT mbufs which hold references on the
899  * command buffer so that it remains "alive" until the data has been
900  * fully transmitted.  If truncate_ok is true, then the mbuf chain
901  * might return a short chain to avoid gratuitously splitting up a
902  * page.
903  */
904 static struct mbuf *
905 nvmf_tcp_command_buffer_mbuf(struct nvmf_tcp_command_buffer *cb,
906     uint32_t data_offset, uint32_t data_len, uint32_t *actual_len,
907     bool can_truncate)
908 {
909 	struct mbuf *m;
910 	size_t len;
911 
912 	m = memdesc_alloc_ext_mbufs(&cb->io.io_mem, nvmf_tcp_mbuf,
913 	    nvmf_tcp_mext_pg, cb, M_WAITOK, data_offset, data_len, &len,
914 	    can_truncate);
915 	if (actual_len != NULL)
916 		*actual_len = len;
917 	return (m);
918 }
919 
920 /* NB: cid and ttag and little-endian already. */
921 static void
922 tcp_send_h2c_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
923     uint32_t data_offset, struct mbuf *m, size_t len, bool last_pdu)
924 {
925 	struct nvme_tcp_h2c_data_hdr h2c;
926 	struct mbuf *top;
927 
928 	memset(&h2c, 0, sizeof(h2c));
929 	h2c.common.pdu_type = NVME_TCP_PDU_TYPE_H2C_DATA;
930 	if (last_pdu)
931 		h2c.common.flags |= NVME_TCP_H2C_DATA_FLAGS_LAST_PDU;
932 	h2c.cccid = cid;
933 	h2c.ttag = ttag;
934 	h2c.datao = htole32(data_offset);
935 	h2c.datal = htole32(len);
936 
937 	top = nvmf_tcp_construct_pdu(qp, &h2c, sizeof(h2c), m, len);
938 	nvmf_tcp_write_pdu(qp, top);
939 }
940 
941 static int
942 nvmf_tcp_handle_r2t(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
943 {
944 	const struct nvme_tcp_r2t_hdr *r2t;
945 	struct nvmf_tcp_command_buffer *cb;
946 	uint32_t data_len, data_offset;
947 
948 	r2t = (const void *)pdu->hdr;
949 
950 	mtx_lock(&qp->tx_buffers.lock);
951 	cb = tcp_find_command_buffer(&qp->tx_buffers, r2t->cccid, 0);
952 	if (cb == NULL) {
953 		mtx_unlock(&qp->tx_buffers.lock);
954 		nvmf_tcp_report_error(qp,
955 		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
956 		    offsetof(struct nvme_tcp_r2t_hdr, cccid), pdu->m,
957 		    pdu->hdr->hlen);
958 		nvmf_tcp_free_pdu(pdu);
959 		return (EBADMSG);
960 	}
961 
962 	data_offset = le32toh(r2t->r2to);
963 	if (data_offset != cb->data_xfered) {
964 		mtx_unlock(&qp->tx_buffers.lock);
965 		nvmf_tcp_report_error(qp,
966 		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
967 		    pdu->hdr->hlen);
968 		nvmf_tcp_free_pdu(pdu);
969 		return (EBADMSG);
970 	}
971 
972 	/*
973 	 * XXX: The spec does not specify how to handle R2T tranfers
974 	 * out of range of the original command.
975 	 */
976 	data_len = le32toh(r2t->r2tl);
977 	if (data_offset + data_len > cb->data_len) {
978 		mtx_unlock(&qp->tx_buffers.lock);
979 		nvmf_tcp_report_error(qp,
980 		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
981 		    pdu->m, pdu->hdr->hlen);
982 		nvmf_tcp_free_pdu(pdu);
983 		return (EBADMSG);
984 	}
985 
986 	cb->data_xfered += data_len;
987 	if (cb->data_xfered == cb->data_len)
988 		tcp_remove_command_buffer(&qp->tx_buffers, cb);
989 	else
990 		tcp_hold_command_buffer(cb);
991 	mtx_unlock(&qp->tx_buffers.lock);
992 
993 	/*
994 	 * Queue one or more H2C_DATA PDUs containing the requested
995 	 * data.
996 	 */
997 	while (data_len > 0) {
998 		struct mbuf *m;
999 		uint32_t sent, todo;
1000 
1001 		todo = min(data_len, qp->max_tx_data);
1002 		m = nvmf_tcp_command_buffer_mbuf(cb, data_offset, todo, &sent,
1003 		    todo < data_len);
1004 		tcp_send_h2c_pdu(qp, r2t->cccid, r2t->ttag, data_offset, m,
1005 		    sent, sent == data_len);
1006 
1007 		data_offset += sent;
1008 		data_len -= sent;
1009 	}
1010 
1011 	tcp_release_command_buffer(cb);
1012 	nvmf_tcp_free_pdu(pdu);
1013 	return (0);
1014 }
1015 
1016 /*
1017  * A variant of m_pullup that uses M_WAITOK instead of failing.  It
1018  * also doesn't do anything if enough bytes are already present in the
1019  * first mbuf.
1020  */
1021 static struct mbuf *
1022 pullup_pdu_hdr(struct mbuf *m, int len)
1023 {
1024 	struct mbuf *n, *p;
1025 
1026 	KASSERT(len <= MCLBYTES, ("%s: len too large", __func__));
1027 	if (m->m_len >= len)
1028 		return (m);
1029 
1030 	n = m_get2(len, M_WAITOK, MT_DATA, 0);
1031 	n->m_len = len;
1032 	m_copydata(m, 0, len, mtod(n, void *));
1033 
1034 	while (m != NULL && m->m_len <= len) {
1035 		p = m->m_next;
1036 		len -= m->m_len;
1037 		m_free(m);
1038 		m = p;
1039 	}
1040 	if (len > 0) {
1041 		m->m_data += len;
1042 		m->m_len -= len;
1043 	}
1044 	n->m_next = m;
1045 	return (n);
1046 }
1047 
1048 static int
1049 nvmf_tcp_dispatch_pdu(struct nvmf_tcp_qpair *qp,
1050     const struct nvme_tcp_common_pdu_hdr *ch, struct nvmf_tcp_rxpdu *pdu)
1051 {
1052 	/* Ensure the PDU header is contiguous. */
1053 	pdu->m = pullup_pdu_hdr(pdu->m, ch->hlen);
1054 	pdu->hdr = mtod(pdu->m, const void *);
1055 
1056 	switch (ch->pdu_type) {
1057 	default:
1058 		__assert_unreachable();
1059 		break;
1060 	case NVME_TCP_PDU_TYPE_H2C_TERM_REQ:
1061 	case NVME_TCP_PDU_TYPE_C2H_TERM_REQ:
1062 		return (nvmf_tcp_handle_term_req(pdu));
1063 	case NVME_TCP_PDU_TYPE_CAPSULE_CMD:
1064 		return (nvmf_tcp_save_command_capsule(qp, pdu));
1065 	case NVME_TCP_PDU_TYPE_CAPSULE_RESP:
1066 		return (nvmf_tcp_save_response_capsule(qp, pdu));
1067 	case NVME_TCP_PDU_TYPE_H2C_DATA:
1068 		return (nvmf_tcp_handle_h2c_data(qp, pdu));
1069 	case NVME_TCP_PDU_TYPE_C2H_DATA:
1070 		return (nvmf_tcp_handle_c2h_data(qp, pdu));
1071 	case NVME_TCP_PDU_TYPE_R2T:
1072 		return (nvmf_tcp_handle_r2t(qp, pdu));
1073 	}
1074 }
1075 
1076 static void
1077 nvmf_tcp_receive(void *arg)
1078 {
1079 	struct nvmf_tcp_qpair *qp = arg;
1080 	struct socket *so = qp->so;
1081 	struct nvmf_tcp_rxpdu pdu;
1082 	struct nvme_tcp_common_pdu_hdr ch;
1083 	struct uio uio;
1084 	struct iovec iov[1];
1085 	struct mbuf *m, *n, *tail;
1086 	u_int avail, needed;
1087 	int error, flags, terror;
1088 	bool have_header;
1089 
1090 	m = tail = NULL;
1091 	have_header = false;
1092 	SOCKBUF_LOCK(&so->so_rcv);
1093 	while (!qp->rx_shutdown) {
1094 		/* Wait until there is enough data for the next step. */
1095 		if (so->so_error != 0 || so->so_rerror != 0) {
1096 			if (so->so_error != 0)
1097 				error = so->so_error;
1098 			else
1099 				error = so->so_rerror;
1100 			SOCKBUF_UNLOCK(&so->so_rcv);
1101 		error:
1102 			m_freem(m);
1103 			nvmf_qpair_error(&qp->qp, error);
1104 			SOCKBUF_LOCK(&so->so_rcv);
1105 			while (!qp->rx_shutdown)
1106 				cv_wait(&qp->rx_cv, SOCKBUF_MTX(&so->so_rcv));
1107 			break;
1108 		}
1109 		avail = sbavail(&so->so_rcv);
1110 		if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) != 0) {
1111 			if (!have_header && avail == 0)
1112 				error = 0;
1113 			else
1114 				error = ECONNRESET;
1115 			SOCKBUF_UNLOCK(&so->so_rcv);
1116 			goto error;
1117 		}
1118 		if (avail == 0 || (!have_header && avail < sizeof(ch))) {
1119 			cv_wait(&qp->rx_cv, SOCKBUF_MTX(&so->so_rcv));
1120 			continue;
1121 		}
1122 		SOCKBUF_UNLOCK(&so->so_rcv);
1123 
1124 		if (!have_header) {
1125 			KASSERT(m == NULL, ("%s: m != NULL but no header",
1126 			    __func__));
1127 			memset(&uio, 0, sizeof(uio));
1128 			iov[0].iov_base = &ch;
1129 			iov[0].iov_len = sizeof(ch);
1130 			uio.uio_iov = iov;
1131 			uio.uio_iovcnt = 1;
1132 			uio.uio_resid = sizeof(ch);
1133 			uio.uio_segflg = UIO_SYSSPACE;
1134 			uio.uio_rw = UIO_READ;
1135 			flags = MSG_DONTWAIT | MSG_PEEK;
1136 
1137 			error = soreceive(so, NULL, &uio, NULL, NULL, &flags);
1138 			if (error != 0)
1139 				goto error;
1140 			KASSERT(uio.uio_resid == 0, ("%s: short CH read",
1141 			    __func__));
1142 
1143 			have_header = true;
1144 			needed = le32toh(ch.plen);
1145 
1146 			/*
1147 			 * Malformed PDUs will be reported as errors
1148 			 * by nvmf_tcp_validate_pdu.  Just pass along
1149 			 * garbage headers if the lengths mismatch.
1150 			 */
1151 			if (needed < sizeof(ch) || ch.hlen > needed)
1152 				needed = sizeof(ch);
1153 
1154 			memset(&uio, 0, sizeof(uio));
1155 			uio.uio_resid = needed;
1156 		}
1157 
1158 		flags = MSG_DONTWAIT;
1159 		error = soreceive(so, NULL, &uio, &n, NULL, &flags);
1160 		if (error != 0)
1161 			goto error;
1162 
1163 		if (m == NULL)
1164 			m = n;
1165 		else
1166 			tail->m_next = n;
1167 
1168 		if (uio.uio_resid != 0) {
1169 			tail = n;
1170 			while (tail->m_next != NULL)
1171 				tail = tail->m_next;
1172 
1173 			SOCKBUF_LOCK(&so->so_rcv);
1174 			continue;
1175 		}
1176 #ifdef INVARIANTS
1177 		tail = NULL;
1178 #endif
1179 
1180 		pdu.m = m;
1181 		m = NULL;
1182 		pdu.hdr = &ch;
1183 		error = nvmf_tcp_validate_pdu(qp, &pdu);
1184 		if (error != 0)
1185 			m_freem(pdu.m);
1186 		else
1187 			error = nvmf_tcp_dispatch_pdu(qp, &ch, &pdu);
1188 		if (error != 0) {
1189 			/*
1190 			 * If we received a termination request, close
1191 			 * the connection immediately.
1192 			 */
1193 			if (error == ECONNRESET)
1194 				goto error;
1195 
1196 			/*
1197 			 * Wait for up to 30 seconds for the socket to
1198 			 * be closed by the other end.
1199 			 */
1200 			SOCKBUF_LOCK(&so->so_rcv);
1201 			if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1202 				terror = cv_timedwait(&qp->rx_cv,
1203 				    SOCKBUF_MTX(&so->so_rcv), 30 * hz);
1204 				if (terror == ETIMEDOUT)
1205 					printf("NVMe/TCP: Timed out after sending terminate request\n");
1206 			}
1207 			SOCKBUF_UNLOCK(&so->so_rcv);
1208 			goto error;
1209 		}
1210 
1211 		have_header = false;
1212 		SOCKBUF_LOCK(&so->so_rcv);
1213 	}
1214 	SOCKBUF_UNLOCK(&so->so_rcv);
1215 	kthread_exit();
1216 }
1217 
1218 static struct mbuf *
1219 tcp_command_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_capsule *tc)
1220 {
1221 	struct nvmf_capsule *nc = &tc->nc;
1222 	struct nvmf_tcp_command_buffer *cb;
1223 	struct nvme_sgl_descriptor *sgl;
1224 	struct nvme_tcp_cmd cmd;
1225 	struct mbuf *top, *m;
1226 	bool use_icd;
1227 
1228 	use_icd = false;
1229 	cb = NULL;
1230 	m = NULL;
1231 
1232 	if (nc->nc_data.io_len != 0) {
1233 		cb = tcp_alloc_command_buffer(qp, &nc->nc_data, 0,
1234 		    nc->nc_data.io_len, nc->nc_sqe.cid);
1235 
1236 		if (nc->nc_send_data && nc->nc_data.io_len <= qp->max_icd) {
1237 			use_icd = true;
1238 			m = nvmf_tcp_command_buffer_mbuf(cb, 0,
1239 			    nc->nc_data.io_len, NULL, false);
1240 			cb->data_xfered = nc->nc_data.io_len;
1241 			tcp_release_command_buffer(cb);
1242 		} else if (nc->nc_send_data) {
1243 			mtx_lock(&qp->tx_buffers.lock);
1244 			tcp_add_command_buffer(&qp->tx_buffers, cb);
1245 			mtx_unlock(&qp->tx_buffers.lock);
1246 		} else {
1247 			mtx_lock(&qp->rx_buffers.lock);
1248 			tcp_add_command_buffer(&qp->rx_buffers, cb);
1249 			mtx_unlock(&qp->rx_buffers.lock);
1250 		}
1251 	}
1252 
1253 	memset(&cmd, 0, sizeof(cmd));
1254 	cmd.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_CMD;
1255 	cmd.ccsqe = nc->nc_sqe;
1256 
1257 	/* Populate SGL in SQE. */
1258 	sgl = &cmd.ccsqe.sgl;
1259 	memset(sgl, 0, sizeof(*sgl));
1260 	sgl->address = 0;
1261 	sgl->length = htole32(nc->nc_data.io_len);
1262 	if (use_icd) {
1263 		/* Use in-capsule data. */
1264 		sgl->type = NVME_SGL_TYPE_ICD;
1265 	} else {
1266 		/* Use a command buffer. */
1267 		sgl->type = NVME_SGL_TYPE_COMMAND_BUFFER;
1268 	}
1269 
1270 	top = nvmf_tcp_construct_pdu(qp, &cmd, sizeof(cmd), m, m != NULL ?
1271 	    nc->nc_data.io_len : 0);
1272 	return (top);
1273 }
1274 
1275 static struct mbuf *
1276 tcp_response_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_capsule *tc)
1277 {
1278 	struct nvmf_capsule *nc = &tc->nc;
1279 	struct nvme_tcp_rsp rsp;
1280 
1281 	memset(&rsp, 0, sizeof(rsp));
1282 	rsp.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_RESP;
1283 	rsp.rccqe = nc->nc_cqe;
1284 
1285 	return (nvmf_tcp_construct_pdu(qp, &rsp, sizeof(rsp), NULL, 0));
1286 }
1287 
1288 static struct mbuf *
1289 capsule_to_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_capsule *tc)
1290 {
1291 	if (tc->nc.nc_qe_len == sizeof(struct nvme_command))
1292 		return (tcp_command_pdu(qp, tc));
1293 	else
1294 		return (tcp_response_pdu(qp, tc));
1295 }
1296 
1297 static void
1298 nvmf_tcp_send(void *arg)
1299 {
1300 	struct nvmf_tcp_qpair *qp = arg;
1301 	struct nvmf_tcp_capsule *tc;
1302 	struct socket *so = qp->so;
1303 	struct mbuf *m, *n, *p;
1304 	u_long space, tosend;
1305 	int error;
1306 
1307 	m = NULL;
1308 	SOCKBUF_LOCK(&so->so_snd);
1309 	while (!qp->tx_shutdown) {
1310 		if (so->so_error != 0) {
1311 			error = so->so_error;
1312 			SOCKBUF_UNLOCK(&so->so_snd);
1313 		error:
1314 			m_freem(m);
1315 			nvmf_qpair_error(&qp->qp, error);
1316 			SOCKBUF_LOCK(&so->so_snd);
1317 			while (!qp->tx_shutdown)
1318 				cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd));
1319 			break;
1320 		}
1321 
1322 		if (m == NULL) {
1323 			/* Next PDU to send. */
1324 			m = mbufq_dequeue(&qp->tx_pdus);
1325 		}
1326 		if (m == NULL) {
1327 			if (STAILQ_EMPTY(&qp->tx_capsules)) {
1328 				cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd));
1329 				continue;
1330 			}
1331 
1332 			/* Convert a capsule into a PDU. */
1333 			tc = STAILQ_FIRST(&qp->tx_capsules);
1334 			STAILQ_REMOVE_HEAD(&qp->tx_capsules, link);
1335 			SOCKBUF_UNLOCK(&so->so_snd);
1336 
1337 			n = capsule_to_pdu(qp, tc);
1338 			tcp_release_capsule(tc);
1339 
1340 			SOCKBUF_LOCK(&so->so_snd);
1341 			mbufq_enqueue(&qp->tx_pdus, n);
1342 			continue;
1343 		}
1344 
1345 		/*
1346 		 * Wait until there is enough room to send some data.
1347 		 * If the socket buffer is empty, always send at least
1348 		 * something.
1349 		 */
1350 		space = sbspace(&so->so_snd);
1351 		if (space < m->m_len && sbused(&so->so_snd) != 0) {
1352 			cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd));
1353 			continue;
1354 		}
1355 		SOCKBUF_UNLOCK(&so->so_snd);
1356 
1357 		/*
1358 		 * If 'm' is too big, then the socket buffer must be
1359 		 * empty.  Split 'm' to make at least some forward
1360 		 * progress.
1361 		 *
1362 		 * Otherwise, chain up as many pending mbufs from 'm'
1363 		 * that will fit.
1364 		 */
1365 		if (m->m_len > space) {
1366 			n = m_split(m, space, M_WAITOK);
1367 		} else {
1368 			tosend = m->m_len;
1369 			n = m->m_next;
1370 			p = m;
1371 			while (n != NULL && tosend + n->m_len <= space) {
1372 				tosend += n->m_len;
1373 				p = n;
1374 				n = n->m_next;
1375 			}
1376 			KASSERT(p->m_next == n, ("%s: p not before n",
1377 			    __func__));
1378 			p->m_next = NULL;
1379 
1380 			KASSERT(m_length(m, NULL) == tosend,
1381 			    ("%s: length mismatch", __func__));
1382 		}
1383 		error = sosend(so, NULL, NULL, m, NULL, MSG_DONTWAIT, NULL);
1384 		if (error != 0) {
1385 			m = NULL;
1386 			m_freem(n);
1387 			goto error;
1388 		}
1389 		m = n;
1390 		SOCKBUF_LOCK(&so->so_snd);
1391 	}
1392 	SOCKBUF_UNLOCK(&so->so_snd);
1393 	kthread_exit();
1394 }
1395 
1396 static int
1397 nvmf_soupcall_receive(struct socket *so, void *arg, int waitflag)
1398 {
1399 	struct nvmf_tcp_qpair *qp = arg;
1400 
1401 	if (soreadable(so))
1402 		cv_signal(&qp->rx_cv);
1403 	return (SU_OK);
1404 }
1405 
1406 static int
1407 nvmf_soupcall_send(struct socket *so, void *arg, int waitflag)
1408 {
1409 	struct nvmf_tcp_qpair *qp = arg;
1410 
1411 	if (sowriteable(so))
1412 		cv_signal(&qp->tx_cv);
1413 	return (SU_OK);
1414 }
1415 
1416 static struct nvmf_qpair *
1417 tcp_allocate_qpair(bool controller, const nvlist_t *nvl)
1418 {
1419 	struct nvmf_tcp_qpair *qp;
1420 	struct socket *so;
1421 	struct file *fp;
1422 	cap_rights_t rights;
1423 	int error;
1424 
1425 	if (!nvlist_exists_number(nvl, "fd") ||
1426 	    !nvlist_exists_number(nvl, "rxpda") ||
1427 	    !nvlist_exists_number(nvl, "txpda") ||
1428 	    !nvlist_exists_bool(nvl, "header_digests") ||
1429 	    !nvlist_exists_bool(nvl, "data_digests") ||
1430 	    !nvlist_exists_number(nvl, "maxr2t") ||
1431 	    !nvlist_exists_number(nvl, "maxh2cdata") ||
1432 	    !nvlist_exists_number(nvl, "max_icd"))
1433 		return (NULL);
1434 
1435 	error = fget(curthread, nvlist_get_number(nvl, "fd"),
1436 	    cap_rights_init_one(&rights, CAP_SOCK_CLIENT), &fp);
1437 	if (error != 0)
1438 		return (NULL);
1439 	if (fp->f_type != DTYPE_SOCKET) {
1440 		fdrop(fp, curthread);
1441 		return (NULL);
1442 	}
1443 	so = fp->f_data;
1444 	if (so->so_type != SOCK_STREAM ||
1445 	    so->so_proto->pr_protocol != IPPROTO_TCP) {
1446 		fdrop(fp, curthread);
1447 		return (NULL);
1448 	}
1449 
1450 	/* Claim socket from file descriptor. */
1451 	fp->f_ops = &badfileops;
1452 	fp->f_data = NULL;
1453 	fdrop(fp, curthread);
1454 
1455 	qp = malloc(sizeof(*qp), M_NVMF_TCP, M_WAITOK | M_ZERO);
1456 	qp->so = so;
1457 	refcount_init(&qp->refs, 1);
1458 	qp->txpda = nvlist_get_number(nvl, "txpda");
1459 	qp->rxpda = nvlist_get_number(nvl, "rxpda");
1460 	qp->header_digests = nvlist_get_bool(nvl, "header_digests");
1461 	qp->data_digests = nvlist_get_bool(nvl, "data_digests");
1462 	qp->maxr2t = nvlist_get_number(nvl, "maxr2t");
1463 	if (controller)
1464 		qp->maxh2cdata = nvlist_get_number(nvl, "maxh2cdata");
1465 	qp->max_tx_data = tcp_max_transmit_data;
1466 	if (!controller) {
1467 		qp->max_tx_data = min(qp->max_tx_data,
1468 		    nvlist_get_number(nvl, "maxh2cdata"));
1469 		qp->max_icd = nvlist_get_number(nvl, "max_icd");
1470 	}
1471 
1472 	if (controller) {
1473 		/* Use the SUCCESS flag if SQ flow control is disabled. */
1474 		qp->send_success = !nvlist_get_bool(nvl, "sq_flow_control");
1475 
1476 		/* NB: maxr2t is 0's based. */
1477 		qp->num_ttags = MIN((u_int)UINT16_MAX + 1,
1478 		    nvlist_get_number(nvl, "qsize") *
1479 		    ((uint64_t)qp->maxr2t + 1));
1480 		qp->open_ttags = mallocarray(qp->num_ttags,
1481 		    sizeof(*qp->open_ttags), M_NVMF_TCP, M_WAITOK | M_ZERO);
1482 	}
1483 
1484 	TAILQ_INIT(&qp->rx_buffers.head);
1485 	TAILQ_INIT(&qp->tx_buffers.head);
1486 	mtx_init(&qp->rx_buffers.lock, "nvmf/tcp rx buffers", NULL, MTX_DEF);
1487 	mtx_init(&qp->tx_buffers.lock, "nvmf/tcp tx buffers", NULL, MTX_DEF);
1488 
1489 	cv_init(&qp->rx_cv, "-");
1490 	cv_init(&qp->tx_cv, "-");
1491 	mbufq_init(&qp->tx_pdus, 0);
1492 	STAILQ_INIT(&qp->tx_capsules);
1493 
1494 	/* Register socket upcalls. */
1495 	SOCKBUF_LOCK(&so->so_rcv);
1496 	soupcall_set(so, SO_RCV, nvmf_soupcall_receive, qp);
1497 	SOCKBUF_UNLOCK(&so->so_rcv);
1498 	SOCKBUF_LOCK(&so->so_snd);
1499 	soupcall_set(so, SO_SND, nvmf_soupcall_send, qp);
1500 	SOCKBUF_UNLOCK(&so->so_snd);
1501 
1502 	/* Spin up kthreads. */
1503 	error = kthread_add(nvmf_tcp_receive, qp, NULL, &qp->rx_thread, 0, 0,
1504 	    "nvmef tcp rx");
1505 	if (error != 0) {
1506 		tcp_free_qpair(&qp->qp);
1507 		return (NULL);
1508 	}
1509 	error = kthread_add(nvmf_tcp_send, qp, NULL, &qp->tx_thread, 0, 0,
1510 	    "nvmef tcp tx");
1511 	if (error != 0) {
1512 		tcp_free_qpair(&qp->qp);
1513 		return (NULL);
1514 	}
1515 
1516 	return (&qp->qp);
1517 }
1518 
1519 static void
1520 tcp_release_qpair(struct nvmf_tcp_qpair *qp)
1521 {
1522 	if (refcount_release(&qp->refs))
1523 		free(qp, M_NVMF_TCP);
1524 }
1525 
1526 static void
1527 tcp_free_qpair(struct nvmf_qpair *nq)
1528 {
1529 	struct nvmf_tcp_qpair *qp = TQP(nq);
1530 	struct nvmf_tcp_command_buffer *ncb, *cb;
1531 	struct nvmf_tcp_capsule *ntc, *tc;
1532 	struct socket *so = qp->so;
1533 
1534 	/* Shut down kthreads and clear upcalls */
1535 	SOCKBUF_LOCK(&so->so_snd);
1536 	qp->tx_shutdown = true;
1537 	if (qp->tx_thread != NULL) {
1538 		cv_signal(&qp->tx_cv);
1539 		mtx_sleep(qp->tx_thread, SOCKBUF_MTX(&so->so_snd), 0,
1540 		    "nvtcptx", 0);
1541 	}
1542 	soupcall_clear(so, SO_SND);
1543 	SOCKBUF_UNLOCK(&so->so_snd);
1544 
1545 	SOCKBUF_LOCK(&so->so_rcv);
1546 	qp->rx_shutdown = true;
1547 	if (qp->rx_thread != NULL) {
1548 		cv_signal(&qp->rx_cv);
1549 		mtx_sleep(qp->rx_thread, SOCKBUF_MTX(&so->so_rcv), 0,
1550 		    "nvtcprx", 0);
1551 	}
1552 	soupcall_clear(so, SO_RCV);
1553 	SOCKBUF_UNLOCK(&so->so_rcv);
1554 
1555 	STAILQ_FOREACH_SAFE(tc, &qp->tx_capsules, link, ntc) {
1556 		nvmf_abort_capsule_data(&tc->nc, ECONNABORTED);
1557 		tcp_release_capsule(tc);
1558 	}
1559 	mbufq_drain(&qp->tx_pdus);
1560 
1561 	cv_destroy(&qp->tx_cv);
1562 	cv_destroy(&qp->rx_cv);
1563 
1564 	if (qp->open_ttags != NULL) {
1565 		for (u_int i = 0; i < qp->num_ttags; i++) {
1566 			cb = qp->open_ttags[i];
1567 			if (cb != NULL) {
1568 				cb->tc->active_r2ts--;
1569 				cb->error = ECONNABORTED;
1570 				tcp_release_command_buffer(cb);
1571 			}
1572 		}
1573 		free(qp->open_ttags, M_NVMF_TCP);
1574 	}
1575 
1576 	mtx_lock(&qp->rx_buffers.lock);
1577 	TAILQ_FOREACH_SAFE(cb, &qp->rx_buffers.head, link, ncb) {
1578 		tcp_remove_command_buffer(&qp->rx_buffers, cb);
1579 		mtx_unlock(&qp->rx_buffers.lock);
1580 #ifdef INVARIANTS
1581 		if (cb->tc != NULL)
1582 			cb->tc->pending_r2ts--;
1583 #endif
1584 		cb->error = ECONNABORTED;
1585 		tcp_release_command_buffer(cb);
1586 		mtx_lock(&qp->rx_buffers.lock);
1587 	}
1588 	mtx_destroy(&qp->rx_buffers.lock);
1589 
1590 	mtx_lock(&qp->tx_buffers.lock);
1591 	TAILQ_FOREACH_SAFE(cb, &qp->tx_buffers.head, link, ncb) {
1592 		tcp_remove_command_buffer(&qp->tx_buffers, cb);
1593 		mtx_unlock(&qp->tx_buffers.lock);
1594 		cb->error = ECONNABORTED;
1595 		tcp_release_command_buffer(cb);
1596 		mtx_lock(&qp->tx_buffers.lock);
1597 	}
1598 	mtx_destroy(&qp->tx_buffers.lock);
1599 
1600 	soclose(so);
1601 
1602 	tcp_release_qpair(qp);
1603 }
1604 
1605 static struct nvmf_capsule *
1606 tcp_allocate_capsule(struct nvmf_qpair *nq, int how)
1607 {
1608 	struct nvmf_tcp_qpair *qp = TQP(nq);
1609 	struct nvmf_tcp_capsule *tc;
1610 
1611 	tc = malloc(sizeof(*tc), M_NVMF_TCP, how | M_ZERO);
1612 	if (tc == NULL)
1613 		return (NULL);
1614 	refcount_init(&tc->refs, 1);
1615 	refcount_acquire(&qp->refs);
1616 	return (&tc->nc);
1617 }
1618 
1619 static void
1620 tcp_release_capsule(struct nvmf_tcp_capsule *tc)
1621 {
1622 	struct nvmf_tcp_qpair *qp = TQP(tc->nc.nc_qpair);
1623 
1624 	if (!refcount_release(&tc->refs))
1625 		return;
1626 
1627 	MPASS(tc->active_r2ts == 0);
1628 	MPASS(tc->pending_r2ts == 0);
1629 
1630 	nvmf_tcp_free_pdu(&tc->rx_pdu);
1631 	free(tc, M_NVMF_TCP);
1632 	tcp_release_qpair(qp);
1633 }
1634 
/* Transport hook: release the caller's reference on a capsule. */
static void
tcp_free_capsule(struct nvmf_capsule *nc)
{
	tcp_release_capsule(TCAP(nc));
}
1642 
1643 static int
1644 tcp_transmit_capsule(struct nvmf_capsule *nc)
1645 {
1646 	struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
1647 	struct nvmf_tcp_capsule *tc = TCAP(nc);
1648 	struct socket *so = qp->so;
1649 
1650 	refcount_acquire(&tc->refs);
1651 	SOCKBUF_LOCK(&so->so_snd);
1652 	STAILQ_INSERT_TAIL(&qp->tx_capsules, tc, link);
1653 	if (sowriteable(so))
1654 		cv_signal(&qp->tx_cv);
1655 	SOCKBUF_UNLOCK(&so->so_snd);
1656 	return (0);
1657 }
1658 
1659 static uint8_t
1660 tcp_validate_command_capsule(struct nvmf_capsule *nc)
1661 {
1662 	struct nvmf_tcp_capsule *tc = TCAP(nc);
1663 	struct nvme_sgl_descriptor *sgl;
1664 
1665 	KASSERT(tc->rx_pdu.hdr != NULL, ("capsule wasn't received"));
1666 
1667 	sgl = &nc->nc_sqe.sgl;
1668 	switch (sgl->type) {
1669 	case NVME_SGL_TYPE_ICD:
1670 		if (tc->rx_pdu.data_len != le32toh(sgl->length)) {
1671 			printf("NVMe/TCP: Command Capsule with mismatched ICD length\n");
1672 			return (NVME_SC_DATA_SGL_LENGTH_INVALID);
1673 		}
1674 		break;
1675 	case NVME_SGL_TYPE_COMMAND_BUFFER:
1676 		if (tc->rx_pdu.data_len != 0) {
1677 			printf("NVMe/TCP: Command Buffer SGL with ICD\n");
1678 			return (NVME_SC_INVALID_FIELD);
1679 		}
1680 		break;
1681 	default:
1682 		printf("NVMe/TCP: Invalid SGL type in Command Capsule\n");
1683 		return (NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID);
1684 	}
1685 
1686 	if (sgl->address != 0) {
1687 		printf("NVMe/TCP: Invalid SGL offset in Command Capsule\n");
1688 		return (NVME_SC_SGL_OFFSET_INVALID);
1689 	}
1690 
1691 	return (NVME_SC_SUCCESS);
1692 }
1693 
1694 static size_t
1695 tcp_capsule_data_len(const struct nvmf_capsule *nc)
1696 {
1697 	MPASS(nc->nc_qe_len == sizeof(struct nvme_command));
1698 	return (le32toh(nc->nc_sqe.sgl.length));
1699 }
1700 
1701 static void
1702 tcp_receive_r2t_data(struct nvmf_capsule *nc, uint32_t data_offset,
1703     struct nvmf_io_request *io)
1704 {
1705 	struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
1706 	struct nvmf_tcp_capsule *tc = TCAP(nc);
1707 	struct nvmf_tcp_command_buffer *cb;
1708 
1709 	cb = tcp_alloc_command_buffer(qp, io, data_offset, io->io_len,
1710 	    nc->nc_sqe.cid);
1711 
1712 	cb->tc = tc;
1713 	refcount_acquire(&tc->refs);
1714 
1715 	/*
1716 	 * If this command has too many active R2Ts or there are no
1717 	 * available transfer tags, queue the request for later.
1718 	 *
1719 	 * NB: maxr2t is 0's based.
1720 	 */
1721 	mtx_lock(&qp->rx_buffers.lock);
1722 	if (tc->active_r2ts > qp->maxr2t || qp->active_ttags == qp->num_ttags) {
1723 #ifdef INVARIANTS
1724 		tc->pending_r2ts++;
1725 #endif
1726 		TAILQ_INSERT_TAIL(&qp->rx_buffers.head, cb, link);
1727 		mtx_unlock(&qp->rx_buffers.lock);
1728 		return;
1729 	}
1730 
1731 	nvmf_tcp_allocate_ttag(qp, cb);
1732 	mtx_unlock(&qp->rx_buffers.lock);
1733 
1734 	tcp_send_r2t(qp, nc->nc_sqe.cid, cb->ttag, data_offset, io->io_len);
1735 }
1736 
1737 static void
1738 tcp_receive_icd_data(struct nvmf_capsule *nc, uint32_t data_offset,
1739     struct nvmf_io_request *io)
1740 {
1741 	struct nvmf_tcp_capsule *tc = TCAP(nc);
1742 
1743 	mbuf_copyto_io(tc->rx_pdu.m, tc->rx_pdu.hdr->pdo + data_offset,
1744 	    io->io_len, io, 0);
1745 	nvmf_complete_io_request(io, io->io_len, 0);
1746 }
1747 
1748 static int
1749 tcp_receive_controller_data(struct nvmf_capsule *nc, uint32_t data_offset,
1750     struct nvmf_io_request *io)
1751 {
1752 	struct nvme_sgl_descriptor *sgl;
1753 	size_t data_len;
1754 
1755 	if (nc->nc_qe_len != sizeof(struct nvme_command) ||
1756 	    !nc->nc_qpair->nq_controller)
1757 		return (EINVAL);
1758 
1759 	sgl = &nc->nc_sqe.sgl;
1760 	data_len = le32toh(sgl->length);
1761 	if (data_offset + io->io_len > data_len)
1762 		return (EFBIG);
1763 
1764 	if (sgl->type == NVME_SGL_TYPE_ICD)
1765 		tcp_receive_icd_data(nc, data_offset, io);
1766 	else
1767 		tcp_receive_r2t_data(nc, data_offset, io);
1768 	return (0);
1769 }
1770 
1771 /* NB: cid is little-endian already. */
1772 static void
1773 tcp_send_c2h_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid, uint32_t data_offset,
1774     struct mbuf *m, size_t len, bool last_pdu, bool success)
1775 {
1776 	struct nvme_tcp_c2h_data_hdr c2h;
1777 	struct mbuf *top;
1778 
1779 	memset(&c2h, 0, sizeof(c2h));
1780 	c2h.common.pdu_type = NVME_TCP_PDU_TYPE_C2H_DATA;
1781 	if (last_pdu)
1782 		c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_LAST_PDU;
1783 	if (success)
1784 		c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_SUCCESS;
1785 	c2h.cccid = cid;
1786 	c2h.datao = htole32(data_offset);
1787 	c2h.datal = htole32(len);
1788 
1789 	top = nvmf_tcp_construct_pdu(qp, &c2h, sizeof(c2h), m, len);
1790 	nvmf_tcp_write_pdu(qp, top);
1791 }
1792 
1793 static u_int
1794 tcp_send_controller_data(struct nvmf_capsule *nc, uint32_t data_offset,
1795     struct mbuf *m, size_t len)
1796 {
1797 	struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
1798 	struct nvme_sgl_descriptor *sgl;
1799 	uint32_t data_len;
1800 	bool last_pdu, last_xfer;
1801 
1802 	if (nc->nc_qe_len != sizeof(struct nvme_command) ||
1803 	    !qp->qp.nq_controller) {
1804 		m_freem(m);
1805 		return (NVME_SC_INVALID_FIELD);
1806 	}
1807 
1808 	sgl = &nc->nc_sqe.sgl;
1809 	data_len = le32toh(sgl->length);
1810 	if (data_offset + len > data_len) {
1811 		m_freem(m);
1812 		return (NVME_SC_INVALID_FIELD);
1813 	}
1814 	last_xfer = (data_offset + len == data_len);
1815 
1816 	if (sgl->type != NVME_SGL_TYPE_COMMAND_BUFFER) {
1817 		m_freem(m);
1818 		return (NVME_SC_INVALID_FIELD);
1819 	}
1820 
1821 	KASSERT(data_offset == TCAP(nc)->tx_data_offset,
1822 	    ("%s: starting data_offset %u doesn't match end of previous xfer %u",
1823 	    __func__, data_offset, TCAP(nc)->tx_data_offset));
1824 
1825 	/* Queue one more C2H_DATA PDUs containing the data from 'm'. */
1826 	while (m != NULL) {
1827 		struct mbuf *n;
1828 		uint32_t todo;
1829 
1830 		if (m->m_len > qp->max_tx_data) {
1831 			n = m_split(m, qp->max_tx_data, M_WAITOK);
1832 			todo = m->m_len;
1833 		} else {
1834 			struct mbuf *p;
1835 
1836 			todo = m->m_len;
1837 			p = m;
1838 			n = p->m_next;
1839 			while (n != NULL) {
1840 				if (todo + n->m_len > qp->max_tx_data) {
1841 					p->m_next = NULL;
1842 					break;
1843 				}
1844 				todo += n->m_len;
1845 				p = n;
1846 				n = p->m_next;
1847 			}
1848 			MPASS(m_length(m, NULL) == todo);
1849 		}
1850 
1851 		last_pdu = (n == NULL && last_xfer);
1852 		tcp_send_c2h_pdu(qp, nc->nc_sqe.cid, data_offset, m, todo,
1853 		    last_pdu, last_pdu && qp->send_success);
1854 
1855 		data_offset += todo;
1856 		data_len -= todo;
1857 		m = n;
1858 	}
1859 	MPASS(data_len == 0);
1860 
1861 #ifdef INVARIANTS
1862 	TCAP(nc)->tx_data_offset = data_offset;
1863 #endif
1864 	if (!last_xfer)
1865 		return (NVMF_MORE);
1866 	else if (qp->send_success)
1867 		return (NVMF_SUCCESS_SENT);
1868 	else
1869 		return (NVME_SC_SUCCESS);
1870 }
1871 
1872 struct nvmf_transport_ops tcp_ops = {
1873 	.allocate_qpair = tcp_allocate_qpair,
1874 	.free_qpair = tcp_free_qpair,
1875 	.allocate_capsule = tcp_allocate_capsule,
1876 	.free_capsule = tcp_free_capsule,
1877 	.transmit_capsule = tcp_transmit_capsule,
1878 	.validate_command_capsule = tcp_validate_command_capsule,
1879 	.capsule_data_len = tcp_capsule_data_len,
1880 	.receive_controller_data = tcp_receive_controller_data,
1881 	.send_controller_data = tcp_send_controller_data,
1882 	.trtype = NVMF_TRTYPE_TCP,
1883 	.priority = 0,
1884 };
1885 
1886 NVMF_TRANSPORT(tcp, tcp_ops);
1887