/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2022-2024 Chelsio Communications, Inc.
 * Written by: John Baldwin <jhb@FreeBSD.org>
 */

#include <sys/endian.h>
#include <sys/gsb_crc32.h>
#include <sys/queue.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <assert.h>
#include <errno.h>
#include <netdb.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "libnvmf.h"
#include "internal.h"
#include "nvmf_tcp.h"

struct nvmf_tcp_qpair;

struct nvmf_tcp_command_buffer {
	struct nvmf_tcp_qpair *qp;

	void	*data;
	size_t	data_len;
	size_t	data_xfered;
	uint32_t data_offset;

	uint16_t cid;
	uint16_t ttag;

	LIST_ENTRY(nvmf_tcp_command_buffer) link;
};

LIST_HEAD(nvmf_tcp_command_buffer_list, nvmf_tcp_command_buffer);

struct nvmf_tcp_association {
	struct nvmf_association na;

	uint32_t ioccsz;
};

struct nvmf_tcp_rxpdu {
	struct nvme_tcp_common_pdu_hdr *hdr;
	uint32_t data_len;
};

struct nvmf_tcp_capsule {
	struct nvmf_capsule nc;

	struct nvmf_tcp_rxpdu rx_pdu;
	struct nvmf_tcp_command_buffer *cb;

	TAILQ_ENTRY(nvmf_tcp_capsule) link;
};

struct nvmf_tcp_qpair {
	struct nvmf_qpair qp;
	int s;

	uint8_t	txpda;
	uint8_t rxpda;
	bool header_digests;
	bool data_digests;
	uint32_t maxr2t;
	uint32_t maxh2cdata;
	uint32_t max_icd;	/* Host only */
	uint16_t next_ttag;	/* Controller only */

	struct nvmf_tcp_command_buffer_list tx_buffers;
	struct nvmf_tcp_command_buffer_list rx_buffers;
	TAILQ_HEAD(, nvmf_tcp_capsule) rx_capsules;
};

#define	TASSOC(na)	((struct nvmf_tcp_association *)(na))
#define	TCAP(nc)	((struct nvmf_tcp_capsule *)(nc))
#define	CTCAP(nc)	((const struct nvmf_tcp_capsule *)(nc))
#define	TQP(qp)		((struct nvmf_tcp_qpair *)(qp))

static const char zero_padding[NVME_TCP_PDU_PDO_MAX_OFFSET];

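/*
 * Compute the CRC32C digest used for the NVMe/TCP header and data
 * digest (HDGST/DDGST) fields.
 */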
static uint32_t
compute_digest(const void *buf, size_t len)
{
	return (calculate_crc32c(0xffffffff, buf, len) ^ 0xffffffff);
}

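/*
 * Allocate a command buffer describing a pending data transfer for the
 * given CID and transfer tag and link it onto the qpair's receive or
 * transmit list.
 */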
static struct nvmf_tcp_command_buffer *
tcp_alloc_command_buffer(struct nvmf_tcp_qpair *qp, void *data,
    uint32_t data_offset, size_t data_len, uint16_t cid, uint16_t ttag,
    bool receive)
{
	struct nvmf_tcp_command_buffer *cb;

	cb = malloc(sizeof(*cb));
	cb->qp = qp;
	cb->data = data;
	cb->data_offset = data_offset;
	cb->data_len = data_len;
	cb->data_xfered = 0;
	cb->cid = cid;
	cb->ttag = ttag;

	if (receive)
		LIST_INSERT_HEAD(&qp->rx_buffers, cb, link);
	else
		LIST_INSERT_HEAD(&qp->tx_buffers, cb, link);
	return (cb);
}

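/*
 * Look up a previously allocated command buffer by CID and transfer tag
 * on the qpair's receive or transmit list.
 */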
static struct nvmf_tcp_command_buffer *
tcp_find_command_buffer(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
    bool receive)
{
	struct nvmf_tcp_command_buffer_list *list;
	struct nvmf_tcp_command_buffer *cb;

	list = receive ? &qp->rx_buffers : &qp->tx_buffers;
	LIST_FOREACH(cb, list, link) {
		if (cb->cid == cid && cb->ttag == ttag)
			return (cb);
	}
	return (NULL);
}

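/*
 * Unlink a command buffer (if one exists) for the given CID and transfer
 * tag without freeing it.
 */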
static void
tcp_purge_command_buffer(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
    bool receive)
{
	struct nvmf_tcp_command_buffer *cb;

	cb = tcp_find_command_buffer(qp, cid, ttag, receive);
	if (cb != NULL)
		LIST_REMOVE(cb, link);
}

static void
tcp_free_command_buffer(struct nvmf_tcp_command_buffer *cb)
{
	LIST_REMOVE(cb, link);
	free(cb);
}

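/*
 * Write a fully constructed PDU to the socket, retrying until all bytes
 * have been written or an error occurs.
 */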
static int
nvmf_tcp_write_pdu(struct nvmf_tcp_qpair *qp, const void *pdu, size_t len)
{
	ssize_t nwritten;
	const char *cp;

	cp = pdu;
	while (len != 0) {
		nwritten = write(qp->s, cp, len);
		if (nwritten < 0)
			return (errno);
		len -= nwritten;
		cp += nwritten;
	}
	return (0);
}

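/*
 * Scatter/gather variant of nvmf_tcp_write_pdu that advances the I/O
 * vector across partial writes.
 */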
static int
nvmf_tcp_write_pdu_iov(struct nvmf_tcp_qpair *qp, struct iovec *iov,
    u_int iovcnt, size_t len)
{
	ssize_t nwritten;

	for (;;) {
		nwritten = writev(qp->s, iov, iovcnt);
		if (nwritten < 0)
			return (errno);

		len -= nwritten;
		if (len == 0)
			return (0);

		while (iov->iov_len <= (size_t)nwritten) {
			nwritten -= iov->iov_len;
			iovcnt--;
			iov++;
		}

		iov->iov_base = (char *)iov->iov_base + nwritten;
		iov->iov_len -= nwritten;
	}
}

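/*
 * Report a fatal PDU error to the peer by sending a termination request
 * PDU, optionally echoing part of the offending PDU header, and then
 * close the socket.
 */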
static void
nvmf_tcp_report_error(struct nvmf_association *na, struct nvmf_tcp_qpair *qp,
    uint16_t fes, uint32_t fei, const void *rx_pdu, size_t pdu_len, u_int hlen)
{
	struct nvme_tcp_term_req_hdr hdr;
	struct iovec iov[2];

	if (hlen != 0) {
		if (hlen > NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE)
			hlen = NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE;
		if (hlen > pdu_len)
			hlen = pdu_len;
	}

	memset(&hdr, 0, sizeof(hdr));
	hdr.common.pdu_type = na->na_controller ?
	    NVME_TCP_PDU_TYPE_C2H_TERM_REQ : NVME_TCP_PDU_TYPE_H2C_TERM_REQ;
	hdr.common.hlen = sizeof(hdr);
	hdr.common.plen = sizeof(hdr) + hlen;
	hdr.fes = htole16(fes);
	le32enc(hdr.fei, fei);
	iov[0].iov_base = &hdr;
	iov[0].iov_len = sizeof(hdr);
	iov[1].iov_base = __DECONST(void *, rx_pdu);
	iov[1].iov_len = hlen;

	(void)nvmf_tcp_write_pdu_iov(qp, iov, nitems(iov), sizeof(hdr) + hlen);
	close(qp->s);
	qp->s = -1;
}

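/*
 * Validate a received PDU header and check the header and data digests
 * when present, reporting protocol errors to the peer where required.
 */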
static int
nvmf_tcp_validate_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu,
    size_t pdu_len)
{
	const struct nvme_tcp_common_pdu_hdr *ch;
	uint32_t data_len, fei, plen;
	uint32_t digest, rx_digest;
	u_int hlen;
	int error;
	uint16_t fes;

	/* Determine how large of a PDU header to return for errors. */
	ch = pdu->hdr;
	hlen = ch->hlen;
	plen = le32toh(ch->plen);
	if (hlen < sizeof(*ch) || hlen > plen)
		hlen = sizeof(*ch);

	error = nvmf_tcp_validate_pdu_header(ch,
	    qp->qp.nq_association->na_controller, qp->header_digests,
	    qp->data_digests, qp->rxpda, &data_len, &fes, &fei);
	if (error != 0) {
		if (error == ECONNRESET) {
			close(qp->s);
			qp->s = -1;
		} else {
			nvmf_tcp_report_error(qp->qp.nq_association, qp,
			    fes, fei, ch, pdu_len, hlen);
		}
		return (error);
	}

	/* Check header digest if present. */
	if ((ch->flags & NVME_TCP_CH_FLAGS_HDGSTF) != 0) {
		digest = compute_digest(ch, ch->hlen);
		memcpy(&rx_digest, (const char *)ch + ch->hlen,
		    sizeof(rx_digest));
		if (digest != rx_digest) {
			printf("NVMe/TCP: Header digest mismatch\n");
			nvmf_tcp_report_error(qp->qp.nq_association, qp,
			    NVME_TCP_TERM_REQ_FES_HDGST_ERROR, rx_digest, ch,
			    pdu_len, hlen);
			return (EBADMSG);
		}
	}

	/* Check data digest if present. */
	if ((ch->flags & NVME_TCP_CH_FLAGS_DDGSTF) != 0) {
		digest = compute_digest((const char *)ch + ch->pdo, data_len);
		memcpy(&rx_digest, (const char *)ch + plen - sizeof(rx_digest),
		    sizeof(rx_digest));
		if (digest != rx_digest) {
			printf("NVMe/TCP: Data digest mismatch\n");
			return (EBADMSG);
		}
	}

	pdu->data_len = data_len;
	return (0);
}

/*
 * Read data from a socket, retrying until the data has been fully
 * read or an error occurs.
 */
static int
nvmf_tcp_read_buffer(int s, void *buf, size_t len)
{
	ssize_t nread;
	char *cp;

	cp = buf;
	while (len != 0) {
		nread = read(s, cp, len);
		if (nread < 0)
			return (errno);
		if (nread == 0)
			return (ECONNRESET);
		len -= nread;
		cp += nread;
	}
	return (0);
}

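/*
 * Read a single PDU from the socket into a freshly allocated buffer and
 * validate it.
 */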
static int
nvmf_tcp_read_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
{
	struct nvme_tcp_common_pdu_hdr ch;
	uint32_t plen;
	int error;

	memset(pdu, 0, sizeof(*pdu));
	error = nvmf_tcp_read_buffer(qp->s, &ch, sizeof(ch));
	if (error != 0)
		return (error);

	plen = le32toh(ch.plen);

	/*
	 * Validate a header with garbage lengths to trigger
	 * an error message without reading more.
	 */
	if (plen < sizeof(ch) || ch.hlen > plen) {
		pdu->hdr = &ch;
		error = nvmf_tcp_validate_pdu(qp, pdu, sizeof(ch));
		pdu->hdr = NULL;
		assert(error != 0);
		return (error);
	}

	/* Read the rest of the PDU. */
	pdu->hdr = malloc(plen);
	memcpy(pdu->hdr, &ch, sizeof(ch));
	error = nvmf_tcp_read_buffer(qp->s, pdu->hdr + 1, plen - sizeof(ch));
	if (error != 0)
		return (error);
	error = nvmf_tcp_validate_pdu(qp, pdu, plen);
	if (error != 0) {
		free(pdu->hdr);
		pdu->hdr = NULL;
	}
	return (error);
}

static void
nvmf_tcp_free_pdu(struct nvmf_tcp_rxpdu *pdu)
{
	free(pdu->hdr);
	pdu->hdr = NULL;
}

static int
nvmf_tcp_handle_term_req(struct nvmf_tcp_rxpdu *pdu)
{
	struct nvme_tcp_term_req_hdr *hdr;

	hdr = (void *)pdu->hdr;

	printf("NVMe/TCP: Received termination request: fes %#x fei %#x\n",
	    le16toh(hdr->fes), le32dec(hdr->fei));
	nvmf_tcp_free_pdu(pdu);
	return (ECONNRESET);
}

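/*
 * Wrap a received command capsule PDU in a capsule structure and queue
 * it for tcp_receive_capsule.
 */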
static int
nvmf_tcp_save_command_capsule(struct nvmf_tcp_qpair *qp,
    struct nvmf_tcp_rxpdu *pdu)
{
	struct nvme_tcp_cmd *cmd;
	struct nvmf_capsule *nc;
	struct nvmf_tcp_capsule *tc;

	cmd = (void *)pdu->hdr;

	nc = nvmf_allocate_command(&qp->qp, &cmd->ccsqe);
	if (nc == NULL)
		return (ENOMEM);

	tc = TCAP(nc);
	tc->rx_pdu = *pdu;

	TAILQ_INSERT_TAIL(&qp->rx_capsules, tc, link);
	return (0);
}

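/*
 * Wrap a received response capsule PDU in a capsule structure, queue it,
 * and drop any command buffers still associated with the completed CID.
 */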
static int
nvmf_tcp_save_response_capsule(struct nvmf_tcp_qpair *qp,
    struct nvmf_tcp_rxpdu *pdu)
{
	struct nvme_tcp_rsp *rsp;
	struct nvmf_capsule *nc;
	struct nvmf_tcp_capsule *tc;

	rsp = (void *)pdu->hdr;

	nc = nvmf_allocate_response(&qp->qp, &rsp->rccqe);
	if (nc == NULL)
		return (ENOMEM);

	nc->nc_sqhd_valid = true;
	tc = TCAP(nc);
	tc->rx_pdu = *pdu;

	TAILQ_INSERT_TAIL(&qp->rx_capsules, tc, link);

	/*
	 * Once the CQE has been received, no further transfers to the
	 * command buffer for the associated CID can occur.
	 */
	tcp_purge_command_buffer(qp, rsp->rccqe.cid, 0, true);
	tcp_purge_command_buffer(qp, rsp->rccqe.cid, 0, false);

	return (0);
}

/*
 * Construct and send a PDU that contains an optional data payload.
 * This includes dealing with digests and the length fields in the
 * common header.
 */
static int
nvmf_tcp_construct_pdu(struct nvmf_tcp_qpair *qp, void *hdr, size_t hlen,
    void *data, uint32_t data_len)
{
	struct nvme_tcp_common_pdu_hdr *ch;
	struct iovec iov[5];
	u_int iovcnt;
	uint32_t header_digest, data_digest, pad, pdo, plen;

	plen = hlen;
	if (qp->header_digests)
		plen += sizeof(header_digest);
	if (data_len != 0) {
		pdo = roundup(plen, qp->txpda);
		pad = pdo - plen;
		plen = pdo + data_len;
		if (qp->data_digests)
			plen += sizeof(data_digest);
	} else {
		assert(data == NULL);
		pdo = 0;
		pad = 0;
	}

	ch = hdr;
	ch->hlen = hlen;
	if (qp->header_digests)
		ch->flags |= NVME_TCP_CH_FLAGS_HDGSTF;
	if (qp->data_digests && data_len != 0)
		ch->flags |= NVME_TCP_CH_FLAGS_DDGSTF;
	ch->pdo = pdo;
	ch->plen = htole32(plen);

	/* CH + PSH */
	iov[0].iov_base = hdr;
	iov[0].iov_len = hlen;
	iovcnt = 1;

	/* HDGST */
	if (qp->header_digests) {
		header_digest = compute_digest(hdr, hlen);
		iov[iovcnt].iov_base = &header_digest;
		iov[iovcnt].iov_len = sizeof(header_digest);
		iovcnt++;
	}

	if (pad != 0) {
		/* PAD */
		iov[iovcnt].iov_base = __DECONST(char *, zero_padding);
		iov[iovcnt].iov_len = pad;
		iovcnt++;
	}

	if (data_len != 0) {
		/* DATA */
		iov[iovcnt].iov_base = data;
		iov[iovcnt].iov_len = data_len;
		iovcnt++;

		/* DDGST */
		if (qp->data_digests) {
			data_digest = compute_digest(data, data_len);
			iov[iovcnt].iov_base = &data_digest;
			iov[iovcnt].iov_len = sizeof(data_digest);
			iovcnt++;
		}
	}

	return (nvmf_tcp_write_pdu_iov(qp, iov, iovcnt, plen));
}

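/*
 * Controller side: validate an H2C_DATA PDU against the transfer tag's
 * receive command buffer and copy the payload into place.
 */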
static int
nvmf_tcp_handle_h2c_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
{
	struct nvme_tcp_h2c_data_hdr *h2c;
	struct nvmf_tcp_command_buffer *cb;
	uint32_t data_len, data_offset;
	const char *icd;

	h2c = (void *)pdu->hdr;
	if (le32toh(h2c->datal) > qp->maxh2cdata) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_LIMIT_EXCEEDED, 0,
		    pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	cb = tcp_find_command_buffer(qp, h2c->cccid, h2c->ttag, true);
	if (cb == NULL) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
		    offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->hdr,
		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	data_len = le32toh(h2c->datal);
	if (data_len != pdu->data_len) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
		    offsetof(struct nvme_tcp_h2c_data_hdr, datal), pdu->hdr,
		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	data_offset = le32toh(h2c->datao);
	if (data_offset < cb->data_offset ||
	    data_offset + data_len > cb->data_offset + cb->data_len) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
		    pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	if (data_offset != cb->data_offset + cb->data_xfered) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	if ((cb->data_xfered + data_len == cb->data_len) !=
	    ((pdu->hdr->flags & NVME_TCP_H2C_DATA_FLAGS_LAST_PDU) != 0)) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	cb->data_xfered += data_len;
	data_offset -= cb->data_offset;
	icd = (const char *)pdu->hdr + pdu->hdr->pdo;
	memcpy((char *)cb->data + data_offset, icd, data_len);

	nvmf_tcp_free_pdu(pdu);
	return (0);
}

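/*
 * Host side: validate a C2H_DATA PDU against the command's receive
 * buffer, copy the payload, and synthesize a successful completion if
 * the SUCCESS flag is set.
 */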
static int
nvmf_tcp_handle_c2h_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
{
	struct nvme_tcp_c2h_data_hdr *c2h;
	struct nvmf_tcp_command_buffer *cb;
	uint32_t data_len, data_offset;
	const char *icd;

	c2h = (void *)pdu->hdr;

	cb = tcp_find_command_buffer(qp, c2h->cccid, 0, true);
	if (cb == NULL) {
		/*
		 * XXX: Could be PDU sequence error if cccid is for a
		 * command that doesn't use a command buffer.
		 */
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
		    offsetof(struct nvme_tcp_c2h_data_hdr, cccid), pdu->hdr,
		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	data_len = le32toh(c2h->datal);
	if (data_len != pdu->data_len) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
		    offsetof(struct nvme_tcp_c2h_data_hdr, datal), pdu->hdr,
		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	data_offset = le32toh(c2h->datao);
	if (data_offset < cb->data_offset ||
	    data_offset + data_len > cb->data_offset + cb->data_len) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
		    pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	if (data_offset != cb->data_offset + cb->data_xfered) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	if ((cb->data_xfered + data_len == cb->data_len) !=
	    ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_LAST_PDU) != 0)) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	cb->data_xfered += data_len;
	data_offset -= cb->data_offset;
	icd = (const char *)pdu->hdr + pdu->hdr->pdo;
	memcpy((char *)cb->data + data_offset, icd, data_len);

	if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) {
		struct nvme_completion cqe;
		struct nvmf_tcp_capsule *tc;
		struct nvmf_capsule *nc;

		memset(&cqe, 0, sizeof(cqe));
		cqe.cid = cb->cid;

		nc = nvmf_allocate_response(&qp->qp, &cqe);
		if (nc == NULL) {
			nvmf_tcp_free_pdu(pdu);
			return (ENOMEM);
		}
		nc->nc_sqhd_valid = false;

		tc = TCAP(nc);
		TAILQ_INSERT_TAIL(&qp->rx_capsules, tc, link);
	}

	nvmf_tcp_free_pdu(pdu);
	return (0);
}

/* NB: cid and ttag are little-endian already. */
static int
tcp_send_h2c_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
    uint32_t data_offset, void *buf, size_t len, bool last_pdu)
{
	struct nvme_tcp_h2c_data_hdr h2c;

	memset(&h2c, 0, sizeof(h2c));
	h2c.common.pdu_type = NVME_TCP_PDU_TYPE_H2C_DATA;
	if (last_pdu)
		h2c.common.flags |= NVME_TCP_H2C_DATA_FLAGS_LAST_PDU;
	h2c.cccid = cid;
	h2c.ttag = ttag;
	h2c.datao = htole32(data_offset);
	h2c.datal = htole32(len);

	return (nvmf_tcp_construct_pdu(qp, &h2c, sizeof(h2c), buf, len));
}

/* Sends one or more H2C_DATA PDUs, subject to MAXH2CDATA. */
static int
tcp_send_h2c_pdus(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
    uint32_t data_offset, void *buf, size_t len, bool last_pdu)
{
	char *p;

	p = buf;
	while (len != 0) {
		size_t todo;
		int error;

		todo = len;
		if (todo > qp->maxh2cdata)
			todo = qp->maxh2cdata;
		error = tcp_send_h2c_pdu(qp, cid, ttag, data_offset, p, todo,
		    last_pdu && todo == len);
		if (error != 0)
			return (error);
		p += todo;
		data_offset += todo;
		len -= todo;
	}
	return (0);
}

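/*
 * Host side: respond to an R2T PDU by streaming the requested range of
 * the command's transmit buffer via H2C_DATA PDUs.
 */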
static int
nvmf_tcp_handle_r2t(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
{
	struct nvmf_tcp_command_buffer *cb;
	struct nvme_tcp_r2t_hdr *r2t;
	uint32_t data_len, data_offset;
	int error;

	r2t = (void *)pdu->hdr;

	cb = tcp_find_command_buffer(qp, r2t->cccid, 0, false);
	if (cb == NULL) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
		    offsetof(struct nvme_tcp_r2t_hdr, cccid), pdu->hdr,
		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	data_offset = le32toh(r2t->r2to);
	if (data_offset != cb->data_xfered) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	/*
	 * XXX: The spec does not specify how to handle R2T transfers
	 * out of range of the original command.
	 */
	data_len = le32toh(r2t->r2tl);
	if (data_offset + data_len > cb->data_len) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
		    pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	cb->data_xfered += data_len;

	/*
	 * Write out one or more H2C_DATA PDUs containing the
	 * requested data.
	 */
	error = tcp_send_h2c_pdus(qp, r2t->cccid, r2t->ttag,
	    data_offset, (char *)cb->data + data_offset, data_len, true);

	nvmf_tcp_free_pdu(pdu);
	return (error);
}

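/*
 * Read the next PDU from the socket and dispatch it to the appropriate
 * handler based on its type.
 */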
static int
nvmf_tcp_receive_pdu(struct nvmf_tcp_qpair *qp)
{
	struct nvmf_tcp_rxpdu pdu;
	int error;

	error = nvmf_tcp_read_pdu(qp, &pdu);
	if (error != 0)
		return (error);

	switch (pdu.hdr->pdu_type) {
	default:
		__unreachable();
		break;
	case NVME_TCP_PDU_TYPE_H2C_TERM_REQ:
	case NVME_TCP_PDU_TYPE_C2H_TERM_REQ:
		return (nvmf_tcp_handle_term_req(&pdu));
	case NVME_TCP_PDU_TYPE_CAPSULE_CMD:
		return (nvmf_tcp_save_command_capsule(qp, &pdu));
	case NVME_TCP_PDU_TYPE_CAPSULE_RESP:
		return (nvmf_tcp_save_response_capsule(qp, &pdu));
	case NVME_TCP_PDU_TYPE_H2C_DATA:
		return (nvmf_tcp_handle_h2c_data(qp, &pdu));
	case NVME_TCP_PDU_TYPE_C2H_DATA:
		return (nvmf_tcp_handle_c2h_data(qp, &pdu));
	case NVME_TCP_PDU_TYPE_R2T:
		return (nvmf_tcp_handle_r2t(qp, &pdu));
	}
}

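/*
 * Validate the common fields of an ICReq or ICResp PDU, reporting the
 * first incorrect field to the peer on failure.
 */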
static bool
nvmf_tcp_validate_ic_pdu(struct nvmf_association *na, struct nvmf_tcp_qpair *qp,
    const struct nvme_tcp_common_pdu_hdr *ch, size_t pdu_len)
{
	const struct nvme_tcp_ic_req *pdu;
	uint32_t plen;
	u_int hlen;

	/* Determine how large of a PDU header to return for errors. */
	hlen = ch->hlen;
	plen = le32toh(ch->plen);
	if (hlen < sizeof(*ch) || hlen > plen)
		hlen = sizeof(*ch);

	/*
	 * Errors must be reported for the lowest incorrect field
	 * first, so validate fields in order.
	 */

	/* Validate pdu_type. */

	/* Controllers only receive PDUs with a PDU direction of 0. */
	if (na->na_controller != ((ch->pdu_type & 0x01) == 0)) {
		na_error(na, "NVMe/TCP: Invalid PDU type %u", ch->pdu_type);
		nvmf_tcp_report_error(na, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 0, ch, pdu_len,
		    hlen);
		return (false);
	}

	switch (ch->pdu_type) {
	case NVME_TCP_PDU_TYPE_IC_REQ:
	case NVME_TCP_PDU_TYPE_IC_RESP:
		break;
	default:
		na_error(na, "NVMe/TCP: Invalid PDU type %u", ch->pdu_type);
		nvmf_tcp_report_error(na, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 0, ch, pdu_len,
		    hlen);
		return (false);
	}

	/* Validate flags. */
	if (ch->flags != 0) {
		na_error(na, "NVMe/TCP: Invalid PDU header flags %#x",
		    ch->flags);
		nvmf_tcp_report_error(na, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 1, ch, pdu_len,
		    hlen);
		return (false);
	}

	/* Validate hlen. */
	if (ch->hlen != 128) {
		na_error(na, "NVMe/TCP: Invalid PDU header length %u",
		    ch->hlen);
		nvmf_tcp_report_error(na, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 2, ch, pdu_len,
		    hlen);
		return (false);
	}

	/* Validate pdo. */
	if (ch->pdo != 0) {
		na_error(na, "NVMe/TCP: Invalid PDU data offset %u", ch->pdo);
		nvmf_tcp_report_error(na, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 3, ch, pdu_len,
		    hlen);
		return (false);
	}

	/* Validate plen. */
	if (plen != 128) {
		na_error(na, "NVMe/TCP: Invalid PDU length %u", plen);
		nvmf_tcp_report_error(na, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 4, ch, pdu_len,
		    hlen);
		return (false);
	}

	/* Validate fields common to both ICReq and ICResp. */
	pdu = (const struct nvme_tcp_ic_req *)ch;
	if (le16toh(pdu->pfv) != 0) {
		na_error(na, "NVMe/TCP: Unsupported PDU version %u",
		    le16toh(pdu->pfv));
		nvmf_tcp_report_error(na, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER,
		    8, ch, pdu_len, hlen);
		return (false);
	}

	if (pdu->hpda > NVME_TCP_HPDA_MAX) {
		na_error(na, "NVMe/TCP: Unsupported PDA %u", pdu->hpda);
		nvmf_tcp_report_error(na, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 10, ch, pdu_len,
		    hlen);
		return (false);
	}

	if (pdu->dgst.bits.reserved != 0) {
		na_error(na, "NVMe/TCP: Invalid digest settings");
		nvmf_tcp_report_error(na, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 11, ch, pdu_len,
		    hlen);
		return (false);
	}

	return (true);
}

static bool
nvmf_tcp_read_ic_req(struct nvmf_association *na, struct nvmf_tcp_qpair *qp,
    struct nvme_tcp_ic_req *pdu)
{
	int error;

	error = nvmf_tcp_read_buffer(qp->s, pdu, sizeof(*pdu));
	if (error != 0) {
		na_error(na, "NVMe/TCP: Failed to read IC request: %s",
		    strerror(error));
		return (false);
	}

	return (nvmf_tcp_validate_ic_pdu(na, qp, &pdu->common, sizeof(*pdu)));
}

static bool
nvmf_tcp_read_ic_resp(struct nvmf_association *na, struct nvmf_tcp_qpair *qp,
    struct nvme_tcp_ic_resp *pdu)
{
	int error;

	error = nvmf_tcp_read_buffer(qp->s, pdu, sizeof(*pdu));
	if (error != 0) {
		na_error(na, "NVMe/TCP: Failed to read IC response: %s",
		    strerror(error));
		return (false);
	}

	return (nvmf_tcp_validate_ic_pdu(na, qp, &pdu->common, sizeof(*pdu)));
}

static struct nvmf_association *
tcp_allocate_association(bool controller,
    const struct nvmf_association_params *params)
{
	struct nvmf_tcp_association *ta;

	if (controller) {
		/* 7.4.10.3 */
		if (params->tcp.maxh2cdata < 4096 ||
		    params->tcp.maxh2cdata % 4 != 0)
			return (NULL);
	}

	ta = calloc(1, sizeof(*ta));

	return (&ta->na);
}

static void
tcp_update_association(struct nvmf_association *na,
    const struct nvme_controller_data *cdata)
{
	struct nvmf_tcp_association *ta = TASSOC(na);

	ta->ioccsz = le32toh(cdata->ioccsz);
}

static void
tcp_free_association(struct nvmf_association *na)
{
	free(na);
}

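/*
 * Host side of connection establishment: send an ICReq, validate the
 * ICResp, and record the negotiated transport parameters in the queue
 * pair.
 */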
static bool
tcp_connect(struct nvmf_tcp_qpair *qp, struct nvmf_association *na, bool admin)
{
	const struct nvmf_association_params *params = &na->na_params;
	struct nvmf_tcp_association *ta = TASSOC(na);
	struct nvme_tcp_ic_req ic_req;
	struct nvme_tcp_ic_resp ic_resp;
	uint32_t maxh2cdata;
	int error;

	if (!admin) {
		if (ta->ioccsz == 0) {
			na_error(na, "TCP I/O queues require cdata");
			return (false);
		}
		if (ta->ioccsz < 4) {
			na_error(na, "Invalid IOCCSZ %u", ta->ioccsz);
			return (false);
		}
	}

	memset(&ic_req, 0, sizeof(ic_req));
	ic_req.common.pdu_type = NVME_TCP_PDU_TYPE_IC_REQ;
	ic_req.common.hlen = sizeof(ic_req);
	ic_req.common.plen = htole32(sizeof(ic_req));
	ic_req.pfv = htole16(0);
	ic_req.hpda = params->tcp.pda;
	if (params->tcp.header_digests)
		ic_req.dgst.bits.hdgst_enable = 1;
	if (params->tcp.data_digests)
		ic_req.dgst.bits.ddgst_enable = 1;
	ic_req.maxr2t = htole32(params->tcp.maxr2t);

	error = nvmf_tcp_write_pdu(qp, &ic_req, sizeof(ic_req));
	if (error != 0) {
		na_error(na, "Failed to write IC request: %s", strerror(error));
		return (false);
	}

	if (!nvmf_tcp_read_ic_resp(na, qp, &ic_resp))
		return (false);

	/* Ensure the controller didn't enable digests we didn't request. */
	if ((!params->tcp.header_digests &&
	    ic_resp.dgst.bits.hdgst_enable != 0) ||
	    (!params->tcp.data_digests &&
	    ic_resp.dgst.bits.ddgst_enable != 0)) {
		na_error(na, "Controller enabled unrequested digests");
		nvmf_tcp_report_error(na, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER,
		    11, &ic_resp, sizeof(ic_resp), sizeof(ic_resp));
		return (false);
	}

	/*
	 * XXX: Is there an upper-bound to enforce here?  Perhaps pick
	 * some large value and report larger values as an unsupported
	 * parameter?
	 */
	maxh2cdata = le32toh(ic_resp.maxh2cdata);
	if (maxh2cdata < 4096 || maxh2cdata % 4 != 0) {
		na_error(na, "Invalid MAXH2CDATA %u", maxh2cdata);
		nvmf_tcp_report_error(na, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 12, &ic_resp,
		    sizeof(ic_resp), sizeof(ic_resp));
		return (false);
	}

	qp->rxpda = (params->tcp.pda + 1) * 4;
	qp->txpda = (ic_resp.cpda + 1) * 4;
	qp->header_digests = ic_resp.dgst.bits.hdgst_enable != 0;
	qp->data_digests = ic_resp.dgst.bits.ddgst_enable != 0;
	qp->maxr2t = params->tcp.maxr2t;
	qp->maxh2cdata = maxh2cdata;
	if (admin)
		/* 7.4.3 */
		qp->max_icd = 8192;
	else
		qp->max_icd = (ta->ioccsz - 4) * 16;

	return (0);
}

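/*
 * Controller side of connection establishment: validate the ICReq, send
 * an ICResp, and record the negotiated transport parameters in the queue
 * pair.
 */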
static bool
tcp_accept(struct nvmf_tcp_qpair *qp, struct nvmf_association *na)
{
	const struct nvmf_association_params *params = &na->na_params;
	struct nvme_tcp_ic_req ic_req;
	struct nvme_tcp_ic_resp ic_resp;
	int error;

	if (!nvmf_tcp_read_ic_req(na, qp, &ic_req))
		return (false);

	memset(&ic_resp, 0, sizeof(ic_resp));
	ic_resp.common.pdu_type = NVME_TCP_PDU_TYPE_IC_RESP;
	ic_resp.common.hlen = sizeof(ic_req);
	ic_resp.common.plen = htole32(sizeof(ic_req));
	ic_resp.pfv = htole16(0);
	ic_resp.cpda = params->tcp.pda;
	if (params->tcp.header_digests && ic_req.dgst.bits.hdgst_enable != 0)
		ic_resp.dgst.bits.hdgst_enable = 1;
	if (params->tcp.data_digests && ic_req.dgst.bits.ddgst_enable != 0)
		ic_resp.dgst.bits.ddgst_enable = 1;
	ic_resp.maxh2cdata = htole32(params->tcp.maxh2cdata);

	error = nvmf_tcp_write_pdu(qp, &ic_resp, sizeof(ic_resp));
	if (error != 0) {
		na_error(na, "Failed to write IC response: %s",
		    strerror(error));
		return (false);
	}

	qp->rxpda = (params->tcp.pda + 1) * 4;
	qp->txpda = (ic_req.hpda + 1) * 4;
	qp->header_digests = ic_resp.dgst.bits.hdgst_enable != 0;
	qp->data_digests = ic_resp.dgst.bits.ddgst_enable != 0;
	qp->maxr2t = le32toh(ic_req.maxr2t);
	qp->maxh2cdata = params->tcp.maxh2cdata;
	qp->max_icd = 0;	/* XXX */
	return (0);
}

static struct nvmf_qpair *
tcp_allocate_qpair(struct nvmf_association *na,
    const struct nvmf_qpair_params *qparams)
{
	const struct nvmf_association_params *aparams = &na->na_params;
	struct nvmf_tcp_qpair *qp;
	int error;

	if (aparams->tcp.pda > NVME_TCP_CPDA_MAX) {
		na_error(na, "Invalid PDA");
		return (NULL);
	}

	qp = calloc(1, sizeof(*qp));
	qp->s = qparams->tcp.fd;
	LIST_INIT(&qp->rx_buffers);
	LIST_INIT(&qp->tx_buffers);
	TAILQ_INIT(&qp->rx_capsules);
	if (na->na_controller)
		error = tcp_accept(qp, na);
	else
		error = tcp_connect(qp, na, qparams->admin);
	if (error != 0) {
		free(qp);
		return (NULL);
	}

	return (&qp->qp);
}

static void
tcp_free_qpair(struct nvmf_qpair *nq)
{
	struct nvmf_tcp_qpair *qp = TQP(nq);
	struct nvmf_tcp_capsule *ntc, *tc;
	struct nvmf_tcp_command_buffer *ncb, *cb;

	TAILQ_FOREACH_SAFE(tc, &qp->rx_capsules, link, ntc) {
		TAILQ_REMOVE(&qp->rx_capsules, tc, link);
		nvmf_free_capsule(&tc->nc);
	}
	LIST_FOREACH_SAFE(cb, &qp->rx_buffers, link, ncb) {
		tcp_free_command_buffer(cb);
	}
	LIST_FOREACH_SAFE(cb, &qp->tx_buffers, link, ncb) {
		tcp_free_command_buffer(cb);
	}
	free(qp);
}

static void
tcp_kernel_handoff_params(struct nvmf_qpair *nq, nvlist_t *nvl)
{
	struct nvmf_tcp_qpair *qp = TQP(nq);

	nvlist_add_number(nvl, "fd", qp->s);
	nvlist_add_number(nvl, "rxpda", qp->rxpda);
	nvlist_add_number(nvl, "txpda", qp->txpda);
	nvlist_add_bool(nvl, "header_digests", qp->header_digests);
	nvlist_add_bool(nvl, "data_digests", qp->data_digests);
	nvlist_add_number(nvl, "maxr2t", qp->maxr2t);
	nvlist_add_number(nvl, "maxh2cdata", qp->maxh2cdata);
	nvlist_add_number(nvl, "max_icd", qp->max_icd);
}

static int
tcp_populate_dle(struct nvmf_qpair *nq, struct nvme_discovery_log_entry *dle)
{
	struct nvmf_tcp_qpair *qp = TQP(nq);
	struct sockaddr_storage ss;
	socklen_t ss_len;

	ss_len = sizeof(ss);
	if (getpeername(qp->s, (struct sockaddr *)&ss, &ss_len) == -1)
		return (errno);

	if (getnameinfo((struct sockaddr *)&ss, ss_len, dle->traddr,
	    sizeof(dle->traddr), dle->trsvcid, sizeof(dle->trsvcid),
	    NI_NUMERICHOST | NI_NUMERICSERV) != 0)
		return (EINVAL);

	return (0);
}

static struct nvmf_capsule *
tcp_allocate_capsule(struct nvmf_qpair *qp __unused)
{
	struct nvmf_tcp_capsule *nc;

	nc = calloc(1, sizeof(*nc));
	return (&nc->nc);
}

static void
tcp_free_capsule(struct nvmf_capsule *nc)
{
	struct nvmf_tcp_capsule *tc = TCAP(nc);

	nvmf_tcp_free_pdu(&tc->rx_pdu);
	if (tc->cb != NULL)
		tcp_free_command_buffer(tc->cb);
	free(tc);
}

static int
tcp_transmit_command(struct nvmf_capsule *nc)
{
	struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
	struct nvmf_tcp_capsule *tc = TCAP(nc);
	struct nvme_tcp_cmd cmd;
	struct nvme_sgl_descriptor *sgl;
	int error;
	bool use_icd;

	use_icd = false;
	if (nc->nc_data_len != 0 && nc->nc_send_data &&
	    nc->nc_data_len <= qp->max_icd)
		use_icd = true;

	memset(&cmd, 0, sizeof(cmd));
	cmd.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_CMD;
	cmd.ccsqe = nc->nc_sqe;

	/* Populate SGL in SQE. */
	sgl = &cmd.ccsqe.sgl;
	memset(sgl, 0, sizeof(*sgl));
	sgl->address = 0;
	sgl->length = htole32(nc->nc_data_len);
	if (use_icd) {
		/* Use in-capsule data. */
		sgl->type = NVME_SGL_TYPE_ICD;
	} else {
		/* Use a command buffer. */
		sgl->type = NVME_SGL_TYPE_COMMAND_BUFFER;
	}

	/* Send command capsule. */
	error = nvmf_tcp_construct_pdu(qp, &cmd, sizeof(cmd), use_icd ?
	    nc->nc_data : NULL, use_icd ? nc->nc_data_len : 0);
	if (error != 0)
		return (error);

	/*
	 * If data will be transferred using a command buffer, allocate a
	 * buffer structure and queue it.
	 */
	if (nc->nc_data_len != 0 && !use_icd)
		tc->cb = tcp_alloc_command_buffer(qp, nc->nc_data, 0,
		    nc->nc_data_len, cmd.ccsqe.cid, 0, !nc->nc_send_data);

	return (0);
}

static int
tcp_transmit_response(struct nvmf_capsule *nc)
{
	struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
	struct nvme_tcp_rsp rsp;

	memset(&rsp, 0, sizeof(rsp));
	rsp.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_RESP;
	rsp.rccqe = nc->nc_cqe;

	return (nvmf_tcp_construct_pdu(qp, &rsp, sizeof(rsp), NULL, 0));
}

static int
tcp_transmit_capsule(struct nvmf_capsule *nc)
{
	if (nc->nc_qe_len == sizeof(struct nvme_command))
		return (tcp_transmit_command(nc));
	else
		return (tcp_transmit_response(nc));
}

static int
tcp_receive_capsule(struct nvmf_qpair *nq, struct nvmf_capsule **ncp)
{
	struct nvmf_tcp_qpair *qp = TQP(nq);
	struct nvmf_tcp_capsule *tc;
	int error;

	while (TAILQ_EMPTY(&qp->rx_capsules)) {
		error = nvmf_tcp_receive_pdu(qp);
		if (error != 0)
			return (error);
	}
	tc = TAILQ_FIRST(&qp->rx_capsules);
	TAILQ_REMOVE(&qp->rx_capsules, tc, link);
	*ncp = &tc->nc;
	return (0);
}

static uint8_t
tcp_validate_command_capsule(const struct nvmf_capsule *nc)
{
	const struct nvmf_tcp_capsule *tc = CTCAP(nc);
	const struct nvme_sgl_descriptor *sgl;

	assert(tc->rx_pdu.hdr != NULL);

	sgl = &nc->nc_sqe.sgl;
	switch (sgl->type) {
	case NVME_SGL_TYPE_ICD:
		if (tc->rx_pdu.data_len != le32toh(sgl->length)) {
			printf("NVMe/TCP: Command Capsule with mismatched ICD length\n");
			return (NVME_SC_DATA_SGL_LENGTH_INVALID);
		}
		break;
	case NVME_SGL_TYPE_COMMAND_BUFFER:
		if (tc->rx_pdu.data_len != 0) {
			printf("NVMe/TCP: Command Buffer SGL with ICD\n");
			return (NVME_SC_INVALID_FIELD);
		}
		break;
	default:
		printf("NVMe/TCP: Invalid SGL type in Command Capsule\n");
		return (NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID);
	}

	if (sgl->address != 0) {
		printf("NVMe/TCP: Invalid SGL offset in Command Capsule\n");
		return (NVME_SC_SGL_OFFSET_INVALID);
	}

	return (NVME_SC_SUCCESS);
}

static size_t
tcp_capsule_data_len(const struct nvmf_capsule *nc)
{
	assert(nc->nc_qe_len == sizeof(struct nvme_command));
	return (le32toh(nc->nc_sqe.sgl.length));
}

/* NB: cid and ttag are both little-endian already. */
static int
tcp_send_r2t(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
    uint32_t data_offset, uint32_t data_len)
{
	struct nvme_tcp_r2t_hdr r2t;

	memset(&r2t, 0, sizeof(r2t));
	r2t.common.pdu_type = NVME_TCP_PDU_TYPE_R2T;
	r2t.cccid = cid;
	r2t.ttag = ttag;
	r2t.r2to = htole32(data_offset);
	r2t.r2tl = htole32(data_len);

	return (nvmf_tcp_construct_pdu(qp, &r2t, sizeof(r2t), NULL, 0));
}

static int
tcp_receive_r2t_data(const struct nvmf_capsule *nc, uint32_t data_offset,
    void *buf, size_t len)
{
	struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
	struct nvmf_tcp_command_buffer *cb;
	int error;
	uint16_t ttag;

	/*
	 * Don't bother byte-swapping ttag as it is just a cookie
	 * value returned by the other end as-is.
	 */
	ttag = qp->next_ttag++;

	error = tcp_send_r2t(qp, nc->nc_sqe.cid, ttag, data_offset, len);
	if (error != 0)
		return (error);

	cb = tcp_alloc_command_buffer(qp, buf, data_offset, len,
	    nc->nc_sqe.cid, ttag, true);

	/* Parse received PDUs until the data transfer is complete. */
	while (cb->data_xfered < cb->data_len) {
		error = nvmf_tcp_receive_pdu(qp);
		if (error != 0)
			break;
	}
	tcp_free_command_buffer(cb);
	return (error);
}

static int
tcp_receive_icd_data(const struct nvmf_capsule *nc, uint32_t data_offset,
    void *buf, size_t len)
{
	const struct nvmf_tcp_capsule *tc = CTCAP(nc);
	const char *icd;

	icd = (const char *)tc->rx_pdu.hdr + tc->rx_pdu.hdr->pdo + data_offset;
	memcpy(buf, icd, len);
	return (0);
}

static int
tcp_receive_controller_data(const struct nvmf_capsule *nc, uint32_t data_offset,
    void *buf, size_t len)
{
	struct nvmf_association *na = nc->nc_qpair->nq_association;
	const struct nvme_sgl_descriptor *sgl;
	size_t data_len;

	if (nc->nc_qe_len != sizeof(struct nvme_command) || !na->na_controller)
		return (EINVAL);

	sgl = &nc->nc_sqe.sgl;
	data_len = le32toh(sgl->length);
	if (data_offset + len > data_len)
		return (EFBIG);

	if (sgl->type == NVME_SGL_TYPE_ICD)
		return (tcp_receive_icd_data(nc, data_offset, buf, len));
	else
		return (tcp_receive_r2t_data(nc, data_offset, buf, len));
}

/* NB: cid is little-endian already. */
static int
tcp_send_c2h_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid,
    uint32_t data_offset, const void *buf, size_t len, bool last_pdu,
    bool success)
{
	struct nvme_tcp_c2h_data_hdr c2h;

	memset(&c2h, 0, sizeof(c2h));
	c2h.common.pdu_type = NVME_TCP_PDU_TYPE_C2H_DATA;
	if (last_pdu)
		c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_LAST_PDU;
	if (success)
		c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_SUCCESS;
	c2h.cccid = cid;
	c2h.datao = htole32(data_offset);
	c2h.datal = htole32(len);

	return (nvmf_tcp_construct_pdu(qp, &c2h, sizeof(c2h),
	    __DECONST(void *, buf), len));
}

static int
tcp_send_controller_data(const struct nvmf_capsule *nc, const void *buf,
    size_t len)
{
	struct nvmf_association *na = nc->nc_qpair->nq_association;
	struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
	const struct nvme_sgl_descriptor *sgl;
	const char *src;
	size_t todo;
	uint32_t data_len, data_offset;
	int error;
	bool last_pdu, send_success_flag;

	if (nc->nc_qe_len != sizeof(struct nvme_command) || !na->na_controller)
		return (EINVAL);

	sgl = &nc->nc_sqe.sgl;
	data_len = le32toh(sgl->length);
	if (len != data_len) {
		nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD);
		return (EFBIG);
	}

	if (sgl->type != NVME_SGL_TYPE_COMMAND_BUFFER) {
		nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD);
		return (EINVAL);
	}

	/* Use the SUCCESS flag if SQ flow control is disabled. */
	send_success_flag = !qp->qp.nq_flow_control;

	/*
	 * Write out one or more C2H_DATA PDUs containing the data.
	 * Each PDU is arbitrarily capped at 256k.
	 */
	data_offset = 0;
	src = buf;
	while (len > 0) {
		if (len > 256 * 1024) {
			todo = 256 * 1024;
			last_pdu = false;
		} else {
			todo = len;
			last_pdu = true;
		}
		error = tcp_send_c2h_pdu(qp, nc->nc_sqe.cid, data_offset,
		    src, todo, last_pdu, last_pdu && send_success_flag);
		if (error != 0) {
			nvmf_send_generic_error(nc,
			    NVME_SC_TRANSIENT_TRANSPORT_ERROR);
			return (error);
		}
		data_offset += todo;
		src += todo;
		len -= todo;
	}
	if (!send_success_flag)
		nvmf_send_success(nc);
	return (0);
}

struct nvmf_transport_ops tcp_ops = {
	.allocate_association = tcp_allocate_association,
	.update_association = tcp_update_association,
	.free_association = tcp_free_association,
	.allocate_qpair = tcp_allocate_qpair,
	.free_qpair = tcp_free_qpair,
	.kernel_handoff_params = tcp_kernel_handoff_params,
	.populate_dle = tcp_populate_dle,
	.allocate_capsule = tcp_allocate_capsule,
	.free_capsule = tcp_free_capsule,
	.transmit_capsule = tcp_transmit_capsule,
	.receive_capsule = tcp_receive_capsule,
	.validate_command_capsule = tcp_validate_command_capsule,
	.capsule_data_len = tcp_capsule_data_len,
	.receive_controller_data = tcp_receive_controller_data,
	.send_controller_data = tcp_send_controller_data,
};