xref: /freebsd/lib/libnvmf/nvmf_tcp.c (revision dd21556857e8d40f66bf5ad54754d9d52669ebf7)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2022-2024 Chelsio Communications, Inc.
5  * Written by: John Baldwin <jhb@FreeBSD.org>
6  */
7 
8 #include <sys/endian.h>
9 #include <sys/gsb_crc32.h>
10 #include <sys/queue.h>
11 #include <sys/uio.h>
12 #include <assert.h>
13 #include <errno.h>
14 #include <stdio.h>
15 #include <stdlib.h>
16 #include <string.h>
17 #include <unistd.h>
18 
19 #include "libnvmf.h"
20 #include "internal.h"
21 #include "nvmf_tcp.h"
22 
struct nvmf_tcp_qpair;

/*
 * Describes one data buffer that data-bearing PDUs are copied into or
 * out of.  Buffers live on a queue pair's rx_buffers or tx_buffers
 * list and are looked up by the (cid, ttag) pair.
 */
struct nvmf_tcp_command_buffer {
	/* Queue pair whose buffer list this entry was inserted on. */
	struct nvmf_tcp_qpair *qp;

	/* Memory holding (or receiving) the data. */
	void *data;
	/* Number of bytes described by 'data'. */
	size_t data_len;
	/* Running count of bytes moved so far. */
	size_t data_xfered;
	/* Offset of data[0] within the command's overall transfer. */
	uint32_t data_offset;

	/*
	 * Lookup keys.  The receive handlers compare these against the
	 * raw (unconverted) fields of incoming PDU headers.
	 */
	uint16_t cid;
	uint16_t ttag;

	LIST_ENTRY(nvmf_tcp_command_buffer) link;
};

LIST_HEAD(nvmf_tcp_command_buffer_list, nvmf_tcp_command_buffer);
40 
41 struct nvmf_tcp_association {
42 	struct nvmf_association na;
43 
44 	uint32_t ioccsz;
45 };
46 
/* A PDU read from the socket. */
struct nvmf_tcp_rxpdu {
	/* Start of the PDU; the common header is followed by the rest. */
	struct nvme_tcp_common_pdu_hdr *hdr;
	/* Payload length established by nvmf_tcp_validate_pdu(). */
	uint32_t data_len;
};
51 
52 struct nvmf_tcp_capsule {
53 	struct nvmf_capsule nc;
54 
55 	struct nvmf_tcp_rxpdu rx_pdu;
56 	struct nvmf_tcp_command_buffer *cb;
57 
58 	TAILQ_ENTRY(nvmf_tcp_capsule) link;
59 };
60 
61 struct nvmf_tcp_qpair {
62 	struct nvmf_qpair qp;
63 	int s;
64 
65 	uint8_t	txpda;
66 	uint8_t rxpda;
67 	bool header_digests;
68 	bool data_digests;
69 	uint32_t maxr2t;
70 	uint32_t maxh2cdata;
71 	uint32_t max_icd;	/* Host only */
72 	uint16_t next_ttag;	/* Controller only */
73 
74 	struct nvmf_tcp_command_buffer_list tx_buffers;
75 	struct nvmf_tcp_command_buffer_list rx_buffers;
76 	TAILQ_HEAD(, nvmf_tcp_capsule) rx_capsules;
77 };
78 
/*
 * Downcasts from the generic libnvmf objects to their TCP wrappers.
 * Each is valid because the generic object is the first member of the
 * TCP-specific structure.
 *
 * TASSOC() previously named its parameter 'nc' but expanded to '(na)',
 * so it only worked when the caller's variable was literally called
 * 'na'.  It now uses its argument.
 */
#define	TASSOC(na)	((struct nvmf_tcp_association *)(na))
#define	TCAP(nc)	((struct nvmf_tcp_capsule *)(nc))
#define	CTCAP(nc)	((const struct nvmf_tcp_capsule *)(nc))
#define	TQP(qp)		((struct nvmf_tcp_qpair *)(qp))
83 
84 static const char zero_padding[NVME_TCP_PDU_PDO_MAX_OFFSET];
85 
86 static uint32_t
87 compute_digest(const void *buf, size_t len)
88 {
89 	return (calculate_crc32c(0xffffffff, buf, len) ^ 0xffffffff);
90 }
91 
92 static struct nvmf_tcp_command_buffer *
93 tcp_alloc_command_buffer(struct nvmf_tcp_qpair *qp, void *data,
94     uint32_t data_offset, size_t data_len, uint16_t cid, uint16_t ttag,
95     bool receive)
96 {
97 	struct nvmf_tcp_command_buffer *cb;
98 
99 	cb = malloc(sizeof(*cb));
100 	cb->qp = qp;
101 	cb->data = data;
102 	cb->data_offset = data_offset;
103 	cb->data_len = data_len;
104 	cb->data_xfered = 0;
105 	cb->cid = cid;
106 	cb->ttag = ttag;
107 
108 	if (receive)
109 		LIST_INSERT_HEAD(&qp->rx_buffers, cb, link);
110 	else
111 		LIST_INSERT_HEAD(&qp->tx_buffers, cb, link);
112 	return (cb);
113 }
114 
115 static struct nvmf_tcp_command_buffer *
116 tcp_find_command_buffer(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
117     bool receive)
118 {
119 	struct nvmf_tcp_command_buffer_list *list;
120 	struct nvmf_tcp_command_buffer *cb;
121 
122 	list = receive ? &qp->rx_buffers : &qp->tx_buffers;
123 	LIST_FOREACH(cb, list, link) {
124 		if (cb->cid == cid && cb->ttag == ttag)
125 			return (cb);
126 	}
127 	return (NULL);
128 }
129 
130 static void
131 tcp_purge_command_buffer(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
132     bool receive)
133 {
134 	struct nvmf_tcp_command_buffer *cb;
135 
136 	cb = tcp_find_command_buffer(qp, cid, ttag, receive);
137 	if (cb != NULL)
138 		LIST_REMOVE(cb, link);
139 }
140 
141 static void
142 tcp_free_command_buffer(struct nvmf_tcp_command_buffer *cb)
143 {
144 	LIST_REMOVE(cb, link);
145 	free(cb);
146 }
147 
148 static int
149 nvmf_tcp_write_pdu(struct nvmf_tcp_qpair *qp, const void *pdu, size_t len)
150 {
151 	ssize_t nwritten;
152 	const char *cp;
153 
154 	cp = pdu;
155 	while (len != 0) {
156 		nwritten = write(qp->s, cp, len);
157 		if (nwritten < 0)
158 			return (errno);
159 		len -= nwritten;
160 		cp += nwritten;
161 	}
162 	return (0);
163 }
164 
165 static int
166 nvmf_tcp_write_pdu_iov(struct nvmf_tcp_qpair *qp, struct iovec *iov,
167     u_int iovcnt, size_t len)
168 {
169 	ssize_t nwritten;
170 
171 	for (;;) {
172 		nwritten = writev(qp->s, iov, iovcnt);
173 		if (nwritten < 0)
174 			return (errno);
175 
176 		len -= nwritten;
177 		if (len == 0)
178 			return (0);
179 
180 		while (iov->iov_len <= (size_t)nwritten) {
181 			nwritten -= iov->iov_len;
182 			iovcnt--;
183 			iov++;
184 		}
185 
186 		iov->iov_base = (char *)iov->iov_base + nwritten;
187 		iov->iov_len -= nwritten;
188 	}
189 }
190 
191 static void
192 nvmf_tcp_report_error(struct nvmf_association *na, struct nvmf_tcp_qpair *qp,
193     uint16_t fes, uint32_t fei, const void *rx_pdu, size_t pdu_len, u_int hlen)
194 {
195 	struct nvme_tcp_term_req_hdr hdr;
196 	struct iovec iov[2];
197 
198 	if (hlen != 0) {
199 		if (hlen > NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE)
200 			hlen = NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE;
201 		if (hlen > pdu_len)
202 			hlen = pdu_len;
203 	}
204 
205 	memset(&hdr, 0, sizeof(hdr));
206 	hdr.common.pdu_type = na->na_controller ?
207 	    NVME_TCP_PDU_TYPE_C2H_TERM_REQ : NVME_TCP_PDU_TYPE_H2C_TERM_REQ;
208 	hdr.common.hlen = sizeof(hdr);
209 	hdr.common.plen = sizeof(hdr) + hlen;
210 	hdr.fes = htole16(fes);
211 	le32enc(hdr.fei, fei);
212 	iov[0].iov_base = &hdr;
213 	iov[0].iov_len = sizeof(hdr);
214 	iov[1].iov_base = __DECONST(void *, rx_pdu);
215 	iov[1].iov_len = hlen;
216 
217 	(void)nvmf_tcp_write_pdu_iov(qp, iov, nitems(iov), sizeof(hdr) + hlen);
218 	close(qp->s);
219 	qp->s = -1;
220 }
221 
222 static int
223 nvmf_tcp_validate_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu,
224     size_t pdu_len)
225 {
226 	const struct nvme_tcp_common_pdu_hdr *ch;
227 	uint32_t data_len, fei, plen;
228 	uint32_t digest, rx_digest;
229 	u_int hlen;
230 	int error;
231 	uint16_t fes;
232 
233 	/* Determine how large of a PDU header to return for errors. */
234 	ch = pdu->hdr;
235 	hlen = ch->hlen;
236 	plen = le32toh(ch->plen);
237 	if (hlen < sizeof(*ch) || hlen > plen)
238 		hlen = sizeof(*ch);
239 
240 	error = nvmf_tcp_validate_pdu_header(ch,
241 	    qp->qp.nq_association->na_controller, qp->header_digests,
242 	    qp->data_digests, qp->rxpda, &data_len, &fes, &fei);
243 	if (error != 0) {
244 		if (error == ECONNRESET) {
245 			close(qp->s);
246 			qp->s = -1;
247 		} else {
248 			nvmf_tcp_report_error(qp->qp.nq_association, qp,
249 			    fes, fei, ch, pdu_len, hlen);
250 		}
251 		return (error);
252 	}
253 
254 	/* Check header digest if present. */
255 	if ((ch->flags & NVME_TCP_CH_FLAGS_HDGSTF) != 0) {
256 		digest = compute_digest(ch, ch->hlen);
257 		memcpy(&rx_digest, (const char *)ch + ch->hlen,
258 		    sizeof(rx_digest));
259 		if (digest != rx_digest) {
260 			printf("NVMe/TCP: Header digest mismatch\n");
261 			nvmf_tcp_report_error(qp->qp.nq_association, qp,
262 			    NVME_TCP_TERM_REQ_FES_HDGST_ERROR, rx_digest, ch,
263 			    pdu_len, hlen);
264 			return (EBADMSG);
265 		}
266 	}
267 
268 	/* Check data digest if present. */
269 	if ((ch->flags & NVME_TCP_CH_FLAGS_DDGSTF) != 0) {
270 		digest = compute_digest((const char *)ch + ch->pdo, data_len);
271 		memcpy(&rx_digest, (const char *)ch + plen - sizeof(rx_digest),
272 		    sizeof(rx_digest));
273 		if (digest != rx_digest) {
274 			printf("NVMe/TCP: Data digest mismatch\n");
275 			return (EBADMSG);
276 		}
277 	}
278 
279 	pdu->data_len = data_len;
280 	return (0);
281 }
282 
/*
 * Fill 'buf' with exactly 'len' bytes from descriptor 's', looping
 * over short reads.  Returns 0 on success, the errno value of a failed
 * read(2), or ECONNRESET if end-of-file is hit first.
 */
static int
nvmf_tcp_read_buffer(int s, void *buf, size_t len)
{
	char *dst = buf;
	size_t resid = len;
	ssize_t got;

	while (resid > 0) {
		got = read(s, dst, resid);
		if (got == 0)
			return (ECONNRESET);
		if (got < 0)
			return (errno);
		dst += got;
		resid -= got;
	}
	return (0);
}
305 
306 static int
307 nvmf_tcp_read_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
308 {
309 	struct nvme_tcp_common_pdu_hdr ch;
310 	uint32_t plen;
311 	int error;
312 
313 	memset(pdu, 0, sizeof(*pdu));
314 	error = nvmf_tcp_read_buffer(qp->s, &ch, sizeof(ch));
315 	if (error != 0)
316 		return (error);
317 
318 	plen = le32toh(ch.plen);
319 
320 	/*
321 	 * Validate a header with garbage lengths to trigger
322 	 * an error message without reading more.
323 	 */
324 	if (plen < sizeof(ch) || ch.hlen > plen) {
325 		pdu->hdr = &ch;
326 		error = nvmf_tcp_validate_pdu(qp, pdu, sizeof(ch));
327 		pdu->hdr = NULL;
328 		assert(error != 0);
329 		return (error);
330 	}
331 
332 	/* Read the rest of the PDU. */
333 	pdu->hdr = malloc(plen);
334 	memcpy(pdu->hdr, &ch, sizeof(ch));
335 	error = nvmf_tcp_read_buffer(qp->s, pdu->hdr + 1, plen - sizeof(ch));
336 	if (error != 0)
337 		return (error);
338 	error = nvmf_tcp_validate_pdu(qp, pdu, plen);
339 	if (error != 0) {
340 		free(pdu->hdr);
341 		pdu->hdr = NULL;
342 	}
343 	return (error);
344 }
345 
346 static void
347 nvmf_tcp_free_pdu(struct nvmf_tcp_rxpdu *pdu)
348 {
349 	free(pdu->hdr);
350 	pdu->hdr = NULL;
351 }
352 
353 static int
354 nvmf_tcp_handle_term_req(struct nvmf_tcp_rxpdu *pdu)
355 {
356 	struct nvme_tcp_term_req_hdr *hdr;
357 
358 	hdr = (void *)pdu->hdr;
359 
360 	printf("NVMe/TCP: Received termination request: fes %#x fei %#x\n",
361 	    le16toh(hdr->fes), le32dec(hdr->fei));
362 	nvmf_tcp_free_pdu(pdu);
363 	return (ECONNRESET);
364 }
365 
366 static int
367 nvmf_tcp_save_command_capsule(struct nvmf_tcp_qpair *qp,
368     struct nvmf_tcp_rxpdu *pdu)
369 {
370 	struct nvme_tcp_cmd *cmd;
371 	struct nvmf_capsule *nc;
372 	struct nvmf_tcp_capsule *tc;
373 
374 	cmd = (void *)pdu->hdr;
375 
376 	nc = nvmf_allocate_command(&qp->qp, &cmd->ccsqe);
377 	if (nc == NULL)
378 		return (ENOMEM);
379 
380 	tc = TCAP(nc);
381 	tc->rx_pdu = *pdu;
382 
383 	TAILQ_INSERT_TAIL(&qp->rx_capsules, tc, link);
384 	return (0);
385 }
386 
387 static int
388 nvmf_tcp_save_response_capsule(struct nvmf_tcp_qpair *qp,
389     struct nvmf_tcp_rxpdu *pdu)
390 {
391 	struct nvme_tcp_rsp *rsp;
392 	struct nvmf_capsule *nc;
393 	struct nvmf_tcp_capsule *tc;
394 
395 	rsp = (void *)pdu->hdr;
396 
397 	nc = nvmf_allocate_response(&qp->qp, &rsp->rccqe);
398 	if (nc == NULL)
399 		return (ENOMEM);
400 
401 	nc->nc_sqhd_valid = true;
402 	tc = TCAP(nc);
403 	tc->rx_pdu = *pdu;
404 
405 	TAILQ_INSERT_TAIL(&qp->rx_capsules, tc, link);
406 
407 	/*
408 	 * Once the CQE has been received, no further transfers to the
409 	 * command buffer for the associated CID can occur.
410 	 */
411 	tcp_purge_command_buffer(qp, rsp->rccqe.cid, 0, true);
412 	tcp_purge_command_buffer(qp, rsp->rccqe.cid, 0, false);
413 
414 	return (0);
415 }
416 
417 /*
418  * Construct and send a PDU that contains an optional data payload.
419  * This includes dealing with digests and the length fields in the
420  * common header.
421  */
422 static int
423 nvmf_tcp_construct_pdu(struct nvmf_tcp_qpair *qp, void *hdr, size_t hlen,
424     void *data, uint32_t data_len)
425 {
426 	struct nvme_tcp_common_pdu_hdr *ch;
427 	struct iovec iov[5];
428 	u_int iovcnt;
429 	uint32_t header_digest, data_digest, pad, pdo, plen;
430 
431 	plen = hlen;
432 	if (qp->header_digests)
433 		plen += sizeof(header_digest);
434 	if (data_len != 0) {
435 		pdo = roundup(plen, qp->txpda);
436 		pad = pdo - plen;
437 		plen = pdo + data_len;
438 		if (qp->data_digests)
439 			plen += sizeof(data_digest);
440 	} else {
441 		assert(data == NULL);
442 		pdo = 0;
443 		pad = 0;
444 	}
445 
446 	ch = hdr;
447 	ch->hlen = hlen;
448 	if (qp->header_digests)
449 		ch->flags |= NVME_TCP_CH_FLAGS_HDGSTF;
450 	if (qp->data_digests && data_len != 0)
451 		ch->flags |= NVME_TCP_CH_FLAGS_DDGSTF;
452 	ch->pdo = pdo;
453 	ch->plen = htole32(plen);
454 
455 	/* CH + PSH */
456 	iov[0].iov_base = hdr;
457 	iov[0].iov_len = hlen;
458 	iovcnt = 1;
459 
460 	/* HDGST */
461 	if (qp->header_digests) {
462 		header_digest = compute_digest(hdr, hlen);
463 		iov[iovcnt].iov_base = &header_digest;
464 		iov[iovcnt].iov_len = sizeof(header_digest);
465 		iovcnt++;
466 	}
467 
468 	if (pad != 0) {
469 		/* PAD */
470 		iov[iovcnt].iov_base = __DECONST(char *, zero_padding);
471 		iov[iovcnt].iov_len = pad;
472 		iovcnt++;
473 	}
474 
475 	if (data_len != 0) {
476 		/* DATA */
477 		iov[iovcnt].iov_base = data;
478 		iov[iovcnt].iov_len = data_len;
479 		iovcnt++;
480 
481 		/* DDGST */
482 		if (qp->data_digests) {
483 			data_digest = compute_digest(data, data_len);
484 			iov[iovcnt].iov_base = &data_digest;
485 			iov[iovcnt].iov_len = sizeof(data_digest);
486 			iovcnt++;
487 		}
488 	}
489 
490 	return (nvmf_tcp_write_pdu_iov(qp, iov, iovcnt, plen));
491 }
492 
493 static int
494 nvmf_tcp_handle_h2c_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
495 {
496 	struct nvme_tcp_h2c_data_hdr *h2c;
497 	struct nvmf_tcp_command_buffer *cb;
498 	uint32_t data_len, data_offset;
499 	const char *icd;
500 
501 	h2c = (void *)pdu->hdr;
502 	if (le32toh(h2c->datal) > qp->maxh2cdata) {
503 		nvmf_tcp_report_error(qp->qp.nq_association, qp,
504 		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_LIMIT_EXCEEDED, 0,
505 		    pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen);
506 		nvmf_tcp_free_pdu(pdu);
507 		return (EBADMSG);
508 	}
509 
510 	cb = tcp_find_command_buffer(qp, h2c->cccid, h2c->ttag, true);
511 	if (cb == NULL) {
512 		nvmf_tcp_report_error(qp->qp.nq_association, qp,
513 		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
514 		    offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->hdr,
515 		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
516 		nvmf_tcp_free_pdu(pdu);
517 		return (EBADMSG);
518 	}
519 
520 	data_len = le32toh(h2c->datal);
521 	if (data_len != pdu->data_len) {
522 		nvmf_tcp_report_error(qp->qp.nq_association, qp,
523 		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
524 		    offsetof(struct nvme_tcp_h2c_data_hdr, datal), pdu->hdr,
525 		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
526 		nvmf_tcp_free_pdu(pdu);
527 		return (EBADMSG);
528 	}
529 
530 	data_offset = le32toh(h2c->datao);
531 	if (data_offset < cb->data_offset ||
532 	    data_offset + data_len > cb->data_offset + cb->data_len) {
533 		nvmf_tcp_report_error(qp->qp.nq_association, qp,
534 		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
535 		    pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen);
536 		nvmf_tcp_free_pdu(pdu);
537 		return (EBADMSG);
538 	}
539 
540 	if (data_offset != cb->data_offset + cb->data_xfered) {
541 		nvmf_tcp_report_error(qp->qp.nq_association, qp,
542 		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
543 		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
544 		nvmf_tcp_free_pdu(pdu);
545 		return (EBADMSG);
546 	}
547 
548 	if ((cb->data_xfered + data_len == cb->data_len) !=
549 	    ((pdu->hdr->flags & NVME_TCP_H2C_DATA_FLAGS_LAST_PDU) != 0)) {
550 		nvmf_tcp_report_error(qp->qp.nq_association, qp,
551 		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
552 		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
553 		nvmf_tcp_free_pdu(pdu);
554 		return (EBADMSG);
555 	}
556 
557 	cb->data_xfered += data_len;
558 	data_offset -= cb->data_offset;
559 	icd = (const char *)pdu->hdr + pdu->hdr->pdo;
560 	memcpy((char *)cb->data + data_offset, icd, data_len);
561 
562 	nvmf_tcp_free_pdu(pdu);
563 	return (0);
564 }
565 
566 static int
567 nvmf_tcp_handle_c2h_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
568 {
569 	struct nvme_tcp_c2h_data_hdr *c2h;
570 	struct nvmf_tcp_command_buffer *cb;
571 	uint32_t data_len, data_offset;
572 	const char *icd;
573 
574 	c2h = (void *)pdu->hdr;
575 
576 	cb = tcp_find_command_buffer(qp, c2h->cccid, 0, true);
577 	if (cb == NULL) {
578 		/*
579 		 * XXX: Could be PDU sequence error if cccid is for a
580 		 * command that doesn't use a command buffer.
581 		 */
582 		nvmf_tcp_report_error(qp->qp.nq_association, qp,
583 		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
584 		    offsetof(struct nvme_tcp_c2h_data_hdr, cccid), pdu->hdr,
585 		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
586 		nvmf_tcp_free_pdu(pdu);
587 		return (EBADMSG);
588 	}
589 
590 	data_len = le32toh(c2h->datal);
591 	if (data_len != pdu->data_len) {
592 		nvmf_tcp_report_error(qp->qp.nq_association, qp,
593 		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
594 		    offsetof(struct nvme_tcp_c2h_data_hdr, datal), pdu->hdr,
595 		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
596 		nvmf_tcp_free_pdu(pdu);
597 		return (EBADMSG);
598 	}
599 
600 	data_offset = le32toh(c2h->datao);
601 	if (data_offset < cb->data_offset ||
602 	    data_offset + data_len > cb->data_offset + cb->data_len) {
603 		nvmf_tcp_report_error(qp->qp.nq_association, qp,
604 		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
605 		    pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen);
606 		nvmf_tcp_free_pdu(pdu);
607 		return (EBADMSG);
608 	}
609 
610 	if (data_offset != cb->data_offset + cb->data_xfered) {
611 		nvmf_tcp_report_error(qp->qp.nq_association, qp,
612 		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
613 		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
614 		nvmf_tcp_free_pdu(pdu);
615 		return (EBADMSG);
616 	}
617 
618 	if ((cb->data_xfered + data_len == cb->data_len) !=
619 	    ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_LAST_PDU) != 0)) {
620 		nvmf_tcp_report_error(qp->qp.nq_association, qp,
621 		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
622 		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
623 		nvmf_tcp_free_pdu(pdu);
624 		return (EBADMSG);
625 	}
626 
627 	cb->data_xfered += data_len;
628 	data_offset -= cb->data_offset;
629 	icd = (const char *)pdu->hdr + pdu->hdr->pdo;
630 	memcpy((char *)cb->data + data_offset, icd, data_len);
631 
632 	if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) {
633 		struct nvme_completion cqe;
634 		struct nvmf_tcp_capsule *tc;
635 		struct nvmf_capsule *nc;
636 
637 		memset(&cqe, 0, sizeof(cqe));
638 		cqe.cid = cb->cid;
639 
640 		nc = nvmf_allocate_response(&qp->qp, &cqe);
641 		if (nc == NULL) {
642 			nvmf_tcp_free_pdu(pdu);
643 			return (ENOMEM);
644 		}
645 		nc->nc_sqhd_valid = false;
646 
647 		tc = TCAP(nc);
648 		TAILQ_INSERT_TAIL(&qp->rx_capsules, tc, link);
649 	}
650 
651 	nvmf_tcp_free_pdu(pdu);
652 	return (0);
653 }
654 
655 /* NB: cid and ttag and little-endian already. */
656 static int
657 tcp_send_h2c_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
658     uint32_t data_offset, void *buf, size_t len, bool last_pdu)
659 {
660 	struct nvme_tcp_h2c_data_hdr h2c;
661 
662 	memset(&h2c, 0, sizeof(h2c));
663 	h2c.common.pdu_type = NVME_TCP_PDU_TYPE_H2C_DATA;
664 	if (last_pdu)
665 		h2c.common.flags |= NVME_TCP_H2C_DATA_FLAGS_LAST_PDU;
666 	h2c.cccid = cid;
667 	h2c.ttag = ttag;
668 	h2c.datao = htole32(data_offset);
669 	h2c.datal = htole32(len);
670 
671 	return (nvmf_tcp_construct_pdu(qp, &h2c, sizeof(h2c), buf, len));
672 }
673 
674 /* Sends one or more H2C_DATA PDUs, subject to MAXH2CDATA. */
675 static int
676 tcp_send_h2c_pdus(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
677     uint32_t data_offset, void *buf, size_t len, bool last_pdu)
678 {
679 	char *p;
680 
681 	p = buf;
682 	while (len != 0) {
683 		size_t todo;
684 		int error;
685 
686 		todo = len;
687 		if (todo > qp->maxh2cdata)
688 			todo = qp->maxh2cdata;
689 		error = tcp_send_h2c_pdu(qp, cid, ttag, data_offset, p, todo,
690 		    last_pdu && todo == len);
691 		if (error != 0)
692 			return (error);
693 		p += todo;
694 		len -= todo;
695 	}
696 	return (0);
697 }
698 
699 static int
700 nvmf_tcp_handle_r2t(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
701 {
702 	struct nvmf_tcp_command_buffer *cb;
703 	struct nvme_tcp_r2t_hdr *r2t;
704 	uint32_t data_len, data_offset;
705 	int error;
706 
707 	r2t = (void *)pdu->hdr;
708 
709 	cb = tcp_find_command_buffer(qp, r2t->cccid, 0, false);
710 	if (cb == NULL) {
711 		nvmf_tcp_report_error(qp->qp.nq_association, qp,
712 		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
713 		    offsetof(struct nvme_tcp_r2t_hdr, cccid), pdu->hdr,
714 		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
715 		nvmf_tcp_free_pdu(pdu);
716 		return (EBADMSG);
717 	}
718 
719 	data_offset = le32toh(r2t->r2to);
720 	if (data_offset != cb->data_xfered) {
721 		nvmf_tcp_report_error(qp->qp.nq_association, qp,
722 		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
723 		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
724 		nvmf_tcp_free_pdu(pdu);
725 		return (EBADMSG);
726 	}
727 
728 	/*
729 	 * XXX: The spec does not specify how to handle R2T tranfers
730 	 * out of range of the original command.
731 	 */
732 	data_len = le32toh(r2t->r2tl);
733 	if (data_offset + data_len > cb->data_len) {
734 		nvmf_tcp_report_error(qp->qp.nq_association, qp,
735 		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
736 		    pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen);
737 		nvmf_tcp_free_pdu(pdu);
738 		return (EBADMSG);
739 	}
740 
741 	cb->data_xfered += data_len;
742 
743 	/*
744 	 * Write out one or more H2C_DATA PDUs containing the
745 	 * requested data.
746 	 */
747 	error = tcp_send_h2c_pdus(qp, r2t->cccid, r2t->ttag,
748 	    data_offset, (char *)cb->data + data_offset, data_len, true);
749 
750 	nvmf_tcp_free_pdu(pdu);
751 	return (error);
752 }
753 
754 static int
755 nvmf_tcp_receive_pdu(struct nvmf_tcp_qpair *qp)
756 {
757 	struct nvmf_tcp_rxpdu pdu;
758 	int error;
759 
760 	error = nvmf_tcp_read_pdu(qp, &pdu);
761 	if (error != 0)
762 		return (error);
763 
764 	switch (pdu.hdr->pdu_type) {
765 	default:
766 		__unreachable();
767 		break;
768 	case NVME_TCP_PDU_TYPE_H2C_TERM_REQ:
769 	case NVME_TCP_PDU_TYPE_C2H_TERM_REQ:
770 		return (nvmf_tcp_handle_term_req(&pdu));
771 	case NVME_TCP_PDU_TYPE_CAPSULE_CMD:
772 		return (nvmf_tcp_save_command_capsule(qp, &pdu));
773 	case NVME_TCP_PDU_TYPE_CAPSULE_RESP:
774 		return (nvmf_tcp_save_response_capsule(qp, &pdu));
775 	case NVME_TCP_PDU_TYPE_H2C_DATA:
776 		return (nvmf_tcp_handle_h2c_data(qp, &pdu));
777 	case NVME_TCP_PDU_TYPE_C2H_DATA:
778 		return (nvmf_tcp_handle_c2h_data(qp, &pdu));
779 	case NVME_TCP_PDU_TYPE_R2T:
780 		return (nvmf_tcp_handle_r2t(qp, &pdu));
781 	}
782 }
783 
784 static bool
785 nvmf_tcp_validate_ic_pdu(struct nvmf_association *na, struct nvmf_tcp_qpair *qp,
786     const struct nvme_tcp_common_pdu_hdr *ch, size_t pdu_len)
787 {
788 	const struct nvme_tcp_ic_req *pdu;
789 	uint32_t plen;
790 	u_int hlen;
791 
792 	/* Determine how large of a PDU header to return for errors. */
793 	hlen = ch->hlen;
794 	plen = le32toh(ch->plen);
795 	if (hlen < sizeof(*ch) || hlen > plen)
796 		hlen = sizeof(*ch);
797 
798 	/*
799 	 * Errors must be reported for the lowest incorrect field
800 	 * first, so validate fields in order.
801 	 */
802 
803 	/* Validate pdu_type. */
804 
805 	/* Controllers only receive PDUs with a PDU direction of 0. */
806 	if (na->na_controller != ((ch->pdu_type & 0x01) == 0)) {
807 		na_error(na, "NVMe/TCP: Invalid PDU type %u", ch->pdu_type);
808 		nvmf_tcp_report_error(na, qp,
809 		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 0, ch, pdu_len,
810 		    hlen);
811 		return (false);
812 	}
813 
814 	switch (ch->pdu_type) {
815 	case NVME_TCP_PDU_TYPE_IC_REQ:
816 	case NVME_TCP_PDU_TYPE_IC_RESP:
817 		break;
818 	default:
819 		na_error(na, "NVMe/TCP: Invalid PDU type %u", ch->pdu_type);
820 		nvmf_tcp_report_error(na, qp,
821 		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 0, ch, pdu_len,
822 		    hlen);
823 		return (false);
824 	}
825 
826 	/* Validate flags. */
827 	if (ch->flags != 0) {
828 		na_error(na, "NVMe/TCP: Invalid PDU header flags %#x",
829 		    ch->flags);
830 		nvmf_tcp_report_error(na, qp,
831 		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 1, ch, pdu_len,
832 		    hlen);
833 		return (false);
834 	}
835 
836 	/* Validate hlen. */
837 	if (ch->hlen != 128) {
838 		na_error(na, "NVMe/TCP: Invalid PDU header length %u",
839 		    ch->hlen);
840 		nvmf_tcp_report_error(na, qp,
841 		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 2, ch, pdu_len,
842 		    hlen);
843 		return (false);
844 	}
845 
846 	/* Validate pdo. */
847 	if (ch->pdo != 0) {
848 		na_error(na, "NVMe/TCP: Invalid PDU data offset %u", ch->pdo);
849 		nvmf_tcp_report_error(na, qp,
850 		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 3, ch, pdu_len,
851 		    hlen);
852 		return (false);
853 	}
854 
855 	/* Validate plen. */
856 	if (plen != 128) {
857 		na_error(na, "NVMe/TCP: Invalid PDU length %u", plen);
858 		nvmf_tcp_report_error(na, qp,
859 		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 4, ch, pdu_len,
860 		    hlen);
861 		return (false);
862 	}
863 
864 	/* Validate fields common to both ICReq and ICResp. */
865 	pdu = (const struct nvme_tcp_ic_req *)ch;
866 	if (le16toh(pdu->pfv) != 0) {
867 		na_error(na, "NVMe/TCP: Unsupported PDU version %u",
868 		    le16toh(pdu->pfv));
869 		nvmf_tcp_report_error(na, qp,
870 		    NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER,
871 		    8, ch, pdu_len, hlen);
872 		return (false);
873 	}
874 
875 	if (pdu->hpda > NVME_TCP_HPDA_MAX) {
876 		na_error(na, "NVMe/TCP: Unsupported PDA %u", pdu->hpda);
877 		nvmf_tcp_report_error(na, qp,
878 		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 10, ch, pdu_len,
879 		    hlen);
880 		return (false);
881 	}
882 
883 	if (pdu->dgst.bits.reserved != 0) {
884 		na_error(na, "NVMe/TCP: Invalid digest settings");
885 		nvmf_tcp_report_error(na, qp,
886 		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 11, ch, pdu_len,
887 		    hlen);
888 		return (false);
889 	}
890 
891 	return (true);
892 }
893 
894 static bool
895 nvmf_tcp_read_ic_req(struct nvmf_association *na, struct nvmf_tcp_qpair *qp,
896     struct nvme_tcp_ic_req *pdu)
897 {
898 	int error;
899 
900 	error = nvmf_tcp_read_buffer(qp->s, pdu, sizeof(*pdu));
901 	if (error != 0) {
902 		na_error(na, "NVMe/TCP: Failed to read IC request: %s",
903 		    strerror(error));
904 		return (false);
905 	}
906 
907 	return (nvmf_tcp_validate_ic_pdu(na, qp, &pdu->common, sizeof(*pdu)));
908 }
909 
910 static bool
911 nvmf_tcp_read_ic_resp(struct nvmf_association *na, struct nvmf_tcp_qpair *qp,
912     struct nvme_tcp_ic_resp *pdu)
913 {
914 	int error;
915 
916 	error = nvmf_tcp_read_buffer(qp->s, pdu, sizeof(*pdu));
917 	if (error != 0) {
918 		na_error(na, "NVMe/TCP: Failed to read IC response: %s",
919 		    strerror(error));
920 		return (false);
921 	}
922 
923 	return (nvmf_tcp_validate_ic_pdu(na, qp, &pdu->common, sizeof(*pdu)));
924 }
925 
926 static struct nvmf_association *
927 tcp_allocate_association(bool controller,
928     const struct nvmf_association_params *params)
929 {
930 	struct nvmf_tcp_association *ta;
931 
932 	if (controller) {
933 		/* 7.4.10.3 */
934 		if (params->tcp.maxh2cdata < 4096 ||
935 		    params->tcp.maxh2cdata % 4 != 0)
936 			return (NULL);
937 	}
938 
939 	ta = calloc(1, sizeof(*ta));
940 
941 	return (&ta->na);
942 }
943 
944 static void
945 tcp_update_association(struct nvmf_association *na,
946     const struct nvme_controller_data *cdata)
947 {
948 	struct nvmf_tcp_association *ta = TASSOC(na);
949 
950 	ta->ioccsz = le32toh(cdata->ioccsz);
951 }
952 
/* Release an association obtained from tcp_allocate_association(). */
static void
tcp_free_association(struct nvmf_association *na)
{
	/* 'na' is the first member of the allocation, so free it directly. */
	free(na);
}
958 
959 static bool
960 tcp_connect(struct nvmf_tcp_qpair *qp, struct nvmf_association *na, bool admin)
961 {
962 	const struct nvmf_association_params *params = &na->na_params;
963 	struct nvmf_tcp_association *ta = TASSOC(na);
964 	struct nvme_tcp_ic_req ic_req;
965 	struct nvme_tcp_ic_resp ic_resp;
966 	uint32_t maxh2cdata;
967 	int error;
968 
969 	if (!admin) {
970 		if (ta->ioccsz == 0) {
971 			na_error(na, "TCP I/O queues require cdata");
972 			return (false);
973 		}
974 		if (ta->ioccsz < 4) {
975 			na_error(na, "Invalid IOCCSZ %u", ta->ioccsz);
976 			return (false);
977 		}
978 	}
979 
980 	memset(&ic_req, 0, sizeof(ic_req));
981 	ic_req.common.pdu_type = NVME_TCP_PDU_TYPE_IC_REQ;
982 	ic_req.common.hlen = sizeof(ic_req);
983 	ic_req.common.plen = htole32(sizeof(ic_req));
984 	ic_req.pfv = htole16(0);
985 	ic_req.hpda = params->tcp.pda;
986 	if (params->tcp.header_digests)
987 		ic_req.dgst.bits.hdgst_enable = 1;
988 	if (params->tcp.data_digests)
989 		ic_req.dgst.bits.ddgst_enable = 1;
990 	ic_req.maxr2t = htole32(params->tcp.maxr2t);
991 
992 	error = nvmf_tcp_write_pdu(qp, &ic_req, sizeof(ic_req));
993 	if (error != 0) {
994 		na_error(na, "Failed to write IC request: %s", strerror(error));
995 		return (false);
996 	}
997 
998 	if (!nvmf_tcp_read_ic_resp(na, qp, &ic_resp))
999 		return (false);
1000 
1001 	/* Ensure the controller didn't enable digests we didn't request. */
1002 	if ((!params->tcp.header_digests &&
1003 	    ic_resp.dgst.bits.hdgst_enable != 0) ||
1004 	    (!params->tcp.data_digests &&
1005 	    ic_resp.dgst.bits.ddgst_enable != 0)) {
1006 		na_error(na, "Controller enabled unrequested digests");
1007 		nvmf_tcp_report_error(na, qp,
1008 		    NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER,
1009 		    11, &ic_resp, sizeof(ic_resp), sizeof(ic_resp));
1010 		return (false);
1011 	}
1012 
1013 	/*
1014 	 * XXX: Is there an upper-bound to enforce here?  Perhaps pick
1015 	 * some large value and report larger values as an unsupported
1016 	 * parameter?
1017 	 */
1018 	maxh2cdata = le32toh(ic_resp.maxh2cdata);
1019 	if (maxh2cdata < 4096 || maxh2cdata % 4 != 0) {
1020 		na_error(na, "Invalid MAXH2CDATA %u", maxh2cdata);
1021 		nvmf_tcp_report_error(na, qp,
1022 		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 12, &ic_resp,
1023 		    sizeof(ic_resp), sizeof(ic_resp));
1024 		return (false);
1025 	}
1026 
1027 	qp->rxpda = (params->tcp.pda + 1) * 4;
1028 	qp->txpda = (ic_resp.cpda + 1) * 4;
1029 	qp->header_digests = ic_resp.dgst.bits.hdgst_enable != 0;
1030 	qp->data_digests = ic_resp.dgst.bits.ddgst_enable != 0;
1031 	qp->maxr2t = params->tcp.maxr2t;
1032 	qp->maxh2cdata = maxh2cdata;
1033 	if (admin)
1034 		/* 7.4.3 */
1035 		qp->max_icd = 8192;
1036 	else
1037 		qp->max_icd = (ta->ioccsz - 4) * 16;
1038 
1039 	return (0);
1040 }
1041 
1042 static bool
1043 tcp_accept(struct nvmf_tcp_qpair *qp, struct nvmf_association *na)
1044 {
1045 	const struct nvmf_association_params *params = &na->na_params;
1046 	struct nvme_tcp_ic_req ic_req;
1047 	struct nvme_tcp_ic_resp ic_resp;
1048 	int error;
1049 
1050 	if (!nvmf_tcp_read_ic_req(na, qp, &ic_req))
1051 		return (false);
1052 
1053 	memset(&ic_resp, 0, sizeof(ic_resp));
1054 	ic_resp.common.pdu_type = NVME_TCP_PDU_TYPE_IC_RESP;
1055 	ic_resp.common.hlen = sizeof(ic_req);
1056 	ic_resp.common.plen = htole32(sizeof(ic_req));
1057 	ic_resp.pfv = htole16(0);
1058 	ic_resp.cpda = params->tcp.pda;
1059 	if (params->tcp.header_digests && ic_req.dgst.bits.hdgst_enable != 0)
1060 		ic_resp.dgst.bits.hdgst_enable = 1;
1061 	if (params->tcp.data_digests && ic_req.dgst.bits.ddgst_enable != 0)
1062 		ic_resp.dgst.bits.ddgst_enable = 1;
1063 	ic_resp.maxh2cdata = htole32(params->tcp.maxh2cdata);
1064 
1065 	error = nvmf_tcp_write_pdu(qp, &ic_resp, sizeof(ic_resp));
1066 	if (error != 0) {
1067 		na_error(na, "Failed to write IC response: %s",
1068 		    strerror(error));
1069 		return (false);
1070 	}
1071 
1072 	qp->rxpda = (params->tcp.pda + 1) * 4;
1073 	qp->txpda = (ic_req.hpda + 1) * 4;
1074 	qp->header_digests = ic_resp.dgst.bits.hdgst_enable != 0;
1075 	qp->data_digests = ic_resp.dgst.bits.ddgst_enable != 0;
1076 	qp->maxr2t = le32toh(ic_req.maxr2t);
1077 	qp->maxh2cdata = params->tcp.maxh2cdata;
1078 	qp->max_icd = 0;	/* XXX */
1079 	return (0);
1080 }
1081 
1082 static struct nvmf_qpair *
1083 tcp_allocate_qpair(struct nvmf_association *na,
1084     const struct nvmf_qpair_params *qparams)
1085 {
1086 	const struct nvmf_association_params *aparams = &na->na_params;
1087 	struct nvmf_tcp_qpair *qp;
1088 	int error;
1089 
1090 	if (aparams->tcp.pda > NVME_TCP_CPDA_MAX) {
1091 		na_error(na, "Invalid PDA");
1092 		return (NULL);
1093 	}
1094 
1095 	qp = calloc(1, sizeof(*qp));
1096 	qp->s = qparams->tcp.fd;
1097 	LIST_INIT(&qp->rx_buffers);
1098 	LIST_INIT(&qp->tx_buffers);
1099 	TAILQ_INIT(&qp->rx_capsules);
1100 	if (na->na_controller)
1101 		error = tcp_accept(qp, na);
1102 	else
1103 		error = tcp_connect(qp, na, qparams->admin);
1104 	if (error != 0) {
1105 		free(qp);
1106 		return (NULL);
1107 	}
1108 
1109 	return (&qp->qp);
1110 }
1111 
1112 static void
1113 tcp_free_qpair(struct nvmf_qpair *nq)
1114 {
1115 	struct nvmf_tcp_qpair *qp = TQP(nq);
1116 	struct nvmf_tcp_capsule *ntc, *tc;
1117 	struct nvmf_tcp_command_buffer *ncb, *cb;
1118 
1119 	TAILQ_FOREACH_SAFE(tc, &qp->rx_capsules, link, ntc) {
1120 		TAILQ_REMOVE(&qp->rx_capsules, tc, link);
1121 		nvmf_free_capsule(&tc->nc);
1122 	}
1123 	LIST_FOREACH_SAFE(cb, &qp->rx_buffers, link, ncb) {
1124 		tcp_free_command_buffer(cb);
1125 	}
1126 	LIST_FOREACH_SAFE(cb, &qp->tx_buffers, link, ncb) {
1127 		tcp_free_command_buffer(cb);
1128 	}
1129 	free(qp);
1130 }
1131 
1132 static void
1133 tcp_kernel_handoff_params(struct nvmf_qpair *nq, nvlist_t *nvl)
1134 {
1135 	struct nvmf_tcp_qpair *qp = TQP(nq);
1136 
1137 	nvlist_add_number(nvl, "fd", qp->s);
1138 	nvlist_add_number(nvl, "rxpda", qp->rxpda);
1139 	nvlist_add_number(nvl, "txpda", qp->txpda);
1140 	nvlist_add_bool(nvl, "header_digests", qp->header_digests);
1141 	nvlist_add_bool(nvl, "data_digests", qp->data_digests);
1142 	nvlist_add_number(nvl, "maxr2t", qp->maxr2t);
1143 	nvlist_add_number(nvl, "maxh2cdata", qp->maxh2cdata);
1144 	nvlist_add_number(nvl, "max_icd", qp->max_icd);
1145 }
1146 
1147 static struct nvmf_capsule *
1148 tcp_allocate_capsule(struct nvmf_qpair *qp __unused)
1149 {
1150 	struct nvmf_tcp_capsule *nc;
1151 
1152 	nc = calloc(1, sizeof(*nc));
1153 	return (&nc->nc);
1154 }
1155 
1156 static void
1157 tcp_free_capsule(struct nvmf_capsule *nc)
1158 {
1159 	struct nvmf_tcp_capsule *tc = TCAP(nc);
1160 
1161 	nvmf_tcp_free_pdu(&tc->rx_pdu);
1162 	if (tc->cb != NULL)
1163 		tcp_free_command_buffer(tc->cb);
1164 	free(tc);
1165 }
1166 
1167 static int
1168 tcp_transmit_command(struct nvmf_capsule *nc)
1169 {
1170 	struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
1171 	struct nvmf_tcp_capsule *tc = TCAP(nc);
1172 	struct nvme_tcp_cmd cmd;
1173 	struct nvme_sgl_descriptor *sgl;
1174 	int error;
1175 	bool use_icd;
1176 
1177 	use_icd = false;
1178 	if (nc->nc_data_len != 0 && nc->nc_send_data &&
1179 	    nc->nc_data_len <= qp->max_icd)
1180 		use_icd = true;
1181 
1182 	memset(&cmd, 0, sizeof(cmd));
1183 	cmd.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_CMD;
1184 	cmd.ccsqe = nc->nc_sqe;
1185 
1186 	/* Populate SGL in SQE. */
1187 	sgl = &cmd.ccsqe.sgl;
1188 	memset(sgl, 0, sizeof(*sgl));
1189 	sgl->address = 0;
1190 	sgl->length = htole32(nc->nc_data_len);
1191 	if (use_icd) {
1192 		/* Use in-capsule data. */
1193 		sgl->type = NVME_SGL_TYPE_ICD;
1194 	} else {
1195 		/* Use a command buffer. */
1196 		sgl->type = NVME_SGL_TYPE_COMMAND_BUFFER;
1197 	}
1198 
1199 	/* Send command capsule. */
1200 	error = nvmf_tcp_construct_pdu(qp, &cmd, sizeof(cmd), use_icd ?
1201 	    nc->nc_data : NULL, use_icd ? nc->nc_data_len : 0);
1202 	if (error != 0)
1203 		return (error);
1204 
1205 	/*
1206 	 * If data will be transferred using a command buffer, allocate a
1207 	 * buffer structure and queue it.
1208 	 */
1209 	if (nc->nc_data_len != 0 && !use_icd)
1210 		tc->cb = tcp_alloc_command_buffer(qp, nc->nc_data, 0,
1211 		    nc->nc_data_len, cmd.ccsqe.cid, 0, !nc->nc_send_data);
1212 
1213 	return (0);
1214 }
1215 
1216 static int
1217 tcp_transmit_response(struct nvmf_capsule *nc)
1218 {
1219 	struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
1220 	struct nvme_tcp_rsp rsp;
1221 
1222 	memset(&rsp, 0, sizeof(rsp));
1223 	rsp.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_RESP;
1224 	rsp.rccqe = nc->nc_cqe;
1225 
1226 	return (nvmf_tcp_construct_pdu(qp, &rsp, sizeof(rsp), NULL, 0));
1227 }
1228 
1229 static int
1230 tcp_transmit_capsule(struct nvmf_capsule *nc)
1231 {
1232 	if (nc->nc_qe_len == sizeof(struct nvme_command))
1233 		return (tcp_transmit_command(nc));
1234 	else
1235 		return (tcp_transmit_response(nc));
1236 }
1237 
1238 static int
1239 tcp_receive_capsule(struct nvmf_qpair *nq, struct nvmf_capsule **ncp)
1240 {
1241 	struct nvmf_tcp_qpair *qp = TQP(nq);
1242 	struct nvmf_tcp_capsule *tc;
1243 	int error;
1244 
1245 	while (TAILQ_EMPTY(&qp->rx_capsules)) {
1246 		error = nvmf_tcp_receive_pdu(qp);
1247 		if (error != 0)
1248 			return (error);
1249 	}
1250 	tc = TAILQ_FIRST(&qp->rx_capsules);
1251 	TAILQ_REMOVE(&qp->rx_capsules, tc, link);
1252 	*ncp = &tc->nc;
1253 	return (0);
1254 }
1255 
1256 static uint8_t
1257 tcp_validate_command_capsule(const struct nvmf_capsule *nc)
1258 {
1259 	const struct nvmf_tcp_capsule *tc = CTCAP(nc);
1260 	const struct nvme_sgl_descriptor *sgl;
1261 
1262 	assert(tc->rx_pdu.hdr != NULL);
1263 
1264 	sgl = &nc->nc_sqe.sgl;
1265 	switch (sgl->type) {
1266 	case NVME_SGL_TYPE_ICD:
1267 		if (tc->rx_pdu.data_len != le32toh(sgl->length)) {
1268 			printf("NVMe/TCP: Command Capsule with mismatched ICD length\n");
1269 			return (NVME_SC_DATA_SGL_LENGTH_INVALID);
1270 		}
1271 		break;
1272 	case NVME_SGL_TYPE_COMMAND_BUFFER:
1273 		if (tc->rx_pdu.data_len != 0) {
1274 			printf("NVMe/TCP: Command Buffer SGL with ICD\n");
1275 			return (NVME_SC_INVALID_FIELD);
1276 		}
1277 		break;
1278 	default:
1279 		printf("NVMe/TCP: Invalid SGL type in Command Capsule\n");
1280 		return (NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID);
1281 	}
1282 
1283 	if (sgl->address != 0) {
1284 		printf("NVMe/TCP: Invalid SGL offset in Command Capsule\n");
1285 		return (NVME_SC_SGL_OFFSET_INVALID);
1286 	}
1287 
1288 	return (NVME_SC_SUCCESS);
1289 }
1290 
1291 static size_t
1292 tcp_capsule_data_len(const struct nvmf_capsule *nc)
1293 {
1294 	assert(nc->nc_qe_len == sizeof(struct nvme_command));
1295 	return (le32toh(nc->nc_sqe.sgl.length));
1296 }
1297 
1298 /* NB: cid and ttag are both little-endian already. */
1299 static int
1300 tcp_send_r2t(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
1301     uint32_t data_offset, uint32_t data_len)
1302 {
1303 	struct nvme_tcp_r2t_hdr r2t;
1304 
1305 	memset(&r2t, 0, sizeof(r2t));
1306 	r2t.common.pdu_type = NVME_TCP_PDU_TYPE_R2T;
1307 	r2t.cccid = cid;
1308 	r2t.ttag = ttag;
1309 	r2t.r2to = htole32(data_offset);
1310 	r2t.r2tl = htole32(data_len);
1311 
1312 	return (nvmf_tcp_construct_pdu(qp, &r2t, sizeof(r2t), NULL, 0));
1313 }
1314 
1315 static int
1316 tcp_receive_r2t_data(const struct nvmf_capsule *nc, uint32_t data_offset,
1317     void *buf, size_t len)
1318 {
1319 	struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
1320 	struct nvmf_tcp_command_buffer *cb;
1321 	int error;
1322 	uint16_t ttag;
1323 
1324 	/*
1325 	 * Don't bother byte-swapping ttag as it is just a cookie
1326 	 * value returned by the other end as-is.
1327 	 */
1328 	ttag = qp->next_ttag++;
1329 
1330 	error = tcp_send_r2t(qp, nc->nc_sqe.cid, ttag, data_offset, len);
1331 	if (error != 0)
1332 		return (error);
1333 
1334 	cb = tcp_alloc_command_buffer(qp, buf, data_offset, len,
1335 	    nc->nc_sqe.cid, ttag, true);
1336 
1337 	/* Parse received PDUs until the data transfer is complete. */
1338 	while (cb->data_xfered < cb->data_len) {
1339 		error = nvmf_tcp_receive_pdu(qp);
1340 		if (error != 0)
1341 			break;
1342 	}
1343 	tcp_free_command_buffer(cb);
1344 	return (error);
1345 }
1346 
1347 static int
1348 tcp_receive_icd_data(const struct nvmf_capsule *nc, uint32_t data_offset,
1349     void *buf, size_t len)
1350 {
1351 	const struct nvmf_tcp_capsule *tc = CTCAP(nc);
1352 	const char *icd;
1353 
1354 	icd = (const char *)tc->rx_pdu.hdr + tc->rx_pdu.hdr->pdo + data_offset;
1355 	memcpy(buf, icd, len);
1356 	return (0);
1357 }
1358 
1359 static int
1360 tcp_receive_controller_data(const struct nvmf_capsule *nc, uint32_t data_offset,
1361     void *buf, size_t len)
1362 {
1363 	struct nvmf_association *na = nc->nc_qpair->nq_association;
1364 	const struct nvme_sgl_descriptor *sgl;
1365 	size_t data_len;
1366 
1367 	if (nc->nc_qe_len != sizeof(struct nvme_command) || !na->na_controller)
1368 		return (EINVAL);
1369 
1370 	sgl = &nc->nc_sqe.sgl;
1371 	data_len = le32toh(sgl->length);
1372 	if (data_offset + len > data_len)
1373 		return (EFBIG);
1374 
1375 	if (sgl->type == NVME_SGL_TYPE_ICD)
1376 		return (tcp_receive_icd_data(nc, data_offset, buf, len));
1377 	else
1378 		return (tcp_receive_r2t_data(nc, data_offset, buf, len));
1379 }
1380 
1381 /* NB: cid is little-endian already. */
1382 static int
1383 tcp_send_c2h_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid,
1384     uint32_t data_offset, const void *buf, size_t len, bool last_pdu,
1385     bool success)
1386 {
1387 	struct nvme_tcp_c2h_data_hdr c2h;
1388 
1389 	memset(&c2h, 0, sizeof(c2h));
1390 	c2h.common.pdu_type = NVME_TCP_PDU_TYPE_C2H_DATA;
1391 	if (last_pdu)
1392 		c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_LAST_PDU;
1393 	if (success)
1394 		c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_SUCCESS;
1395 	c2h.cccid = cid;
1396 	c2h.datao = htole32(data_offset);
1397 	c2h.datal = htole32(len);
1398 
1399 	return (nvmf_tcp_construct_pdu(qp, &c2h, sizeof(c2h),
1400 	    __DECONST(void *, buf), len));
1401 }
1402 
1403 static int
1404 tcp_send_controller_data(const struct nvmf_capsule *nc, const void *buf,
1405     size_t len)
1406 {
1407 	struct nvmf_association *na = nc->nc_qpair->nq_association;
1408 	struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
1409 	const struct nvme_sgl_descriptor *sgl;
1410 	const char *src;
1411 	size_t todo;
1412 	uint32_t data_len, data_offset;
1413 	int error;
1414 	bool last_pdu, send_success_flag;
1415 
1416 	if (nc->nc_qe_len != sizeof(struct nvme_command) || !na->na_controller)
1417 		return (EINVAL);
1418 
1419 	sgl = &nc->nc_sqe.sgl;
1420 	data_len = le32toh(sgl->length);
1421 	if (len != data_len) {
1422 		nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD);
1423 		return (EFBIG);
1424 	}
1425 
1426 	if (sgl->type != NVME_SGL_TYPE_COMMAND_BUFFER) {
1427 		nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD);
1428 		return (EINVAL);
1429 	}
1430 
1431 	/* Use the SUCCESS flag if SQ flow control is disabled. */
1432 	send_success_flag = !qp->qp.nq_flow_control;
1433 
1434 	/*
1435 	 * Write out one or more C2H_DATA PDUs containing the data.
1436 	 * Each PDU is arbitrarily capped at 256k.
1437 	 */
1438 	data_offset = 0;
1439 	src = buf;
1440 	while (len > 0) {
1441 		if (len > 256 * 1024) {
1442 			todo = 256 * 1024;
1443 			last_pdu = false;
1444 		} else {
1445 			todo = len;
1446 			last_pdu = true;
1447 		}
1448 		error = tcp_send_c2h_pdu(qp, nc->nc_sqe.cid, data_offset,
1449 		    src, todo, last_pdu, last_pdu && send_success_flag);
1450 		if (error != 0) {
1451 			nvmf_send_generic_error(nc,
1452 			    NVME_SC_TRANSIENT_TRANSPORT_ERROR);
1453 			return (error);
1454 		}
1455 		data_offset += todo;
1456 		src += todo;
1457 		len -= todo;
1458 	}
1459 	if (!send_success_flag)
1460 		nvmf_send_success(nc);
1461 	return (0);
1462 }
1463 
1464 struct nvmf_transport_ops tcp_ops = {
1465 	.allocate_association = tcp_allocate_association,
1466 	.update_association = tcp_update_association,
1467 	.free_association = tcp_free_association,
1468 	.allocate_qpair = tcp_allocate_qpair,
1469 	.free_qpair = tcp_free_qpair,
1470 	.kernel_handoff_params = tcp_kernel_handoff_params,
1471 	.allocate_capsule = tcp_allocate_capsule,
1472 	.free_capsule = tcp_free_capsule,
1473 	.transmit_capsule = tcp_transmit_capsule,
1474 	.receive_capsule = tcp_receive_capsule,
1475 	.validate_command_capsule = tcp_validate_command_capsule,
1476 	.capsule_data_len = tcp_capsule_data_len,
1477 	.receive_controller_data = tcp_receive_controller_data,
1478 	.send_controller_data = tcp_send_controller_data,
1479 };
1480