xref: /freebsd/lib/libnvmf/nvmf_tcp.c (revision 8bba2c0f8958443790b1f3abc0675719da987e87)
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2022-2024 Chelsio Communications, Inc.
 * Written by: John Baldwin <jhb@FreeBSD.org>
 */

#include <sys/endian.h>
#include <sys/gsb_crc32.h>
#include <sys/queue.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <assert.h>
#include <errno.h>
#include <netdb.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "libnvmf.h"
#include "internal.h"
#include "nvmf_tcp.h"

struct nvmf_tcp_qpair;

struct nvmf_tcp_command_buffer {
	struct nvmf_tcp_qpair *qp;

	void	*data;
	size_t	data_len;
	size_t	data_xfered;
	uint32_t data_offset;

	uint16_t cid;
	uint16_t ttag;

	LIST_ENTRY(nvmf_tcp_command_buffer) link;
};

LIST_HEAD(nvmf_tcp_command_buffer_list, nvmf_tcp_command_buffer);

struct nvmf_tcp_association {
	struct nvmf_association na;

	uint32_t ioccsz;
};

struct nvmf_tcp_rxpdu {
	struct nvme_tcp_common_pdu_hdr *hdr;
	uint32_t data_len;
};

struct nvmf_tcp_capsule {
	struct nvmf_capsule nc;

	struct nvmf_tcp_rxpdu rx_pdu;
	struct nvmf_tcp_command_buffer *cb;

	TAILQ_ENTRY(nvmf_tcp_capsule) link;
};

struct nvmf_tcp_qpair {
	struct nvmf_qpair qp;
	int s;

	uint8_t	txpda;
	uint8_t rxpda;
	bool header_digests;
	bool data_digests;
	uint32_t maxr2t;
	uint32_t maxh2cdata;
	uint32_t max_icd;	/* Host only */
	uint16_t next_ttag;	/* Controller only */

	struct nvmf_tcp_command_buffer_list tx_buffers;
	struct nvmf_tcp_command_buffer_list rx_buffers;
	TAILQ_HEAD(, nvmf_tcp_capsule) rx_capsules;
};

#define	TASSOC(na)	((struct nvmf_tcp_association *)(na))
#define	TCAP(nc)	((struct nvmf_tcp_capsule *)(nc))
#define	CTCAP(nc)	((const struct nvmf_tcp_capsule *)(nc))
#define	TQP(qp)		((struct nvmf_tcp_qpair *)(qp))

static const char zero_padding[NVME_TCP_PDU_PDO_MAX_OFFSET];

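/*
 * Compute the CRC32C digest used for NVMe/TCP header (HDGST) and data
 * (DDGST) digests.
 */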
static uint32_t
compute_digest(const void *buf, size_t len)
{
	return (calculate_crc32c(0xffffffff, buf, len) ^ 0xffffffff);
}

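/*
 * Allocate a tracking structure for a data buffer used by an in-flight
 * transfer and link it onto the qpair's receive or transmit list.
 */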
static struct nvmf_tcp_command_buffer *
tcp_alloc_command_buffer(struct nvmf_tcp_qpair *qp, void *data,
    uint32_t data_offset, size_t data_len, uint16_t cid, uint16_t ttag,
    bool receive)
{
	struct nvmf_tcp_command_buffer *cb;

	cb = malloc(sizeof(*cb));
	cb->qp = qp;
	cb->data = data;
	cb->data_offset = data_offset;
	cb->data_len = data_len;
	cb->data_xfered = 0;
	cb->cid = cid;
	cb->ttag = ttag;

	if (receive)
		LIST_INSERT_HEAD(&qp->rx_buffers, cb, link);
	else
		LIST_INSERT_HEAD(&qp->tx_buffers, cb, link);
	return (cb);
}

static struct nvmf_tcp_command_buffer *
tcp_find_command_buffer(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
    bool receive)
{
	struct nvmf_tcp_command_buffer_list *list;
	struct nvmf_tcp_command_buffer *cb;

	list = receive ? &qp->rx_buffers : &qp->tx_buffers;
	LIST_FOREACH(cb, list, link) {
		if (cb->cid == cid && cb->ttag == ttag)
			return (cb);
	}
	return (NULL);
}

static void
tcp_purge_command_buffer(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
    bool receive)
{
	struct nvmf_tcp_command_buffer *cb;

	cb = tcp_find_command_buffer(qp, cid, ttag, receive);
	if (cb != NULL)
		LIST_REMOVE(cb, link);
}

static void
tcp_free_command_buffer(struct nvmf_tcp_command_buffer *cb)
{
	LIST_REMOVE(cb, link);
	free(cb);
}

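/*
 * Write a fully-constructed PDU to the socket, retrying short writes
 * until the entire PDU has been sent or an error occurs.
 */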
static int
nvmf_tcp_write_pdu(struct nvmf_tcp_qpair *qp, const void *pdu, size_t len)
{
	ssize_t nwritten;
	const char *cp;

	cp = pdu;
	while (len != 0) {
		nwritten = write(qp->s, cp, len);
		if (nwritten < 0)
			return (errno);
		len -= nwritten;
		cp += nwritten;
	}
	return (0);
}

static int
nvmf_tcp_write_pdu_iov(struct nvmf_tcp_qpair *qp, struct iovec *iov,
    u_int iovcnt, size_t len)
{
	ssize_t nwritten;

	for (;;) {
		nwritten = writev(qp->s, iov, iovcnt);
		if (nwritten < 0)
			return (errno);

		len -= nwritten;
		if (len == 0)
			return (0);

		while (iov->iov_len <= (size_t)nwritten) {
			nwritten -= iov->iov_len;
			iovcnt--;
			iov++;
		}

		iov->iov_base = (char *)iov->iov_base + nwritten;
		iov->iov_len -= nwritten;
	}
}

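/*
 * Send a termination request PDU describing a fatal protocol error and
 * then shut down the connection.  Up to 'hlen' bytes of the offending
 * PDU header are echoed back as error data.
 */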
static void
nvmf_tcp_report_error(struct nvmf_association *na, struct nvmf_tcp_qpair *qp,
    uint16_t fes, uint32_t fei, const void *rx_pdu, size_t pdu_len, u_int hlen)
{
	struct nvme_tcp_term_req_hdr hdr;
	struct iovec iov[2];

	if (hlen != 0) {
		if (hlen > NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE)
			hlen = NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE;
		if (hlen > pdu_len)
			hlen = pdu_len;
	}

	memset(&hdr, 0, sizeof(hdr));
	hdr.common.pdu_type = na->na_controller ?
	    NVME_TCP_PDU_TYPE_C2H_TERM_REQ : NVME_TCP_PDU_TYPE_H2C_TERM_REQ;
	hdr.common.hlen = sizeof(hdr);
	hdr.common.plen = sizeof(hdr) + hlen;
	hdr.fes = htole16(fes);
	le32enc(hdr.fei, fei);
	iov[0].iov_base = &hdr;
	iov[0].iov_len = sizeof(hdr);
	iov[1].iov_base = __DECONST(void *, rx_pdu);
	iov[1].iov_len = hlen;

	(void)nvmf_tcp_write_pdu_iov(qp, iov, nitems(iov), sizeof(hdr) + hlen);
	close(qp->s);
	qp->s = -1;
}

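/*
 * Validate a received PDU header and verify its header and data
 * digests if enabled.  On success, the length of any data payload is
 * saved in the rxpdu.
 */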
static int
nvmf_tcp_validate_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu,
    size_t pdu_len)
{
	const struct nvme_tcp_common_pdu_hdr *ch;
	uint32_t data_len, fei, plen;
	uint32_t digest, rx_digest;
	u_int hlen;
	int error;
	uint16_t fes;

	/* Determine how large of a PDU header to return for errors. */
	ch = pdu->hdr;
	hlen = ch->hlen;
	plen = le32toh(ch->plen);
	if (hlen < sizeof(*ch) || hlen > plen)
		hlen = sizeof(*ch);

	error = nvmf_tcp_validate_pdu_header(ch,
	    qp->qp.nq_association->na_controller, qp->header_digests,
	    qp->data_digests, qp->rxpda, &data_len, &fes, &fei);
	if (error != 0) {
		if (error == ECONNRESET) {
			close(qp->s);
			qp->s = -1;
		} else {
			nvmf_tcp_report_error(qp->qp.nq_association, qp,
			    fes, fei, ch, pdu_len, hlen);
		}
		return (error);
	}

	/* Check header digest if present. */
	if ((ch->flags & NVME_TCP_CH_FLAGS_HDGSTF) != 0) {
		digest = compute_digest(ch, ch->hlen);
		memcpy(&rx_digest, (const char *)ch + ch->hlen,
		    sizeof(rx_digest));
		if (digest != rx_digest) {
			printf("NVMe/TCP: Header digest mismatch\n");
			nvmf_tcp_report_error(qp->qp.nq_association, qp,
			    NVME_TCP_TERM_REQ_FES_HDGST_ERROR, rx_digest, ch,
			    pdu_len, hlen);
			return (EBADMSG);
		}
	}

	/* Check data digest if present. */
	if ((ch->flags & NVME_TCP_CH_FLAGS_DDGSTF) != 0) {
		digest = compute_digest((const char *)ch + ch->pdo, data_len);
		memcpy(&rx_digest, (const char *)ch + plen - sizeof(rx_digest),
		    sizeof(rx_digest));
		if (digest != rx_digest) {
			printf("NVMe/TCP: Data digest mismatch\n");
			return (EBADMSG);
		}
	}

	pdu->data_len = data_len;
	return (0);
}

/*
 * Read data from a socket, retrying until the data has been fully
 * read or an error occurs.
 */
static int
nvmf_tcp_read_buffer(int s, void *buf, size_t len)
{
	ssize_t nread;
	char *cp;

	cp = buf;
	while (len != 0) {
		nread = read(s, cp, len);
		if (nread < 0)
			return (errno);
		if (nread == 0)
			return (ECONNRESET);
		len -= nread;
		cp += nread;
	}
	return (0);
}

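/*
 * Read and validate the next PDU from the socket.  On success the PDU
 * is returned in a malloc'd buffer that the caller must release with
 * nvmf_tcp_free_pdu().
 */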
static int
nvmf_tcp_read_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
{
	struct nvme_tcp_common_pdu_hdr ch;
	uint32_t plen;
	int error;

	memset(pdu, 0, sizeof(*pdu));
	error = nvmf_tcp_read_buffer(qp->s, &ch, sizeof(ch));
	if (error != 0)
		return (error);

	plen = le32toh(ch.plen);

	/*
	 * Validate a header with garbage lengths to trigger
	 * an error message without reading more.
	 */
	if (plen < sizeof(ch) || ch.hlen > plen) {
		pdu->hdr = &ch;
		error = nvmf_tcp_validate_pdu(qp, pdu, sizeof(ch));
		pdu->hdr = NULL;
		assert(error != 0);
		return (error);
	}

	/* Read the rest of the PDU. */
	pdu->hdr = malloc(plen);
	memcpy(pdu->hdr, &ch, sizeof(ch));
	error = nvmf_tcp_read_buffer(qp->s, pdu->hdr + 1, plen - sizeof(ch));
	if (error != 0)
		return (error);
	error = nvmf_tcp_validate_pdu(qp, pdu, plen);
	if (error != 0) {
		free(pdu->hdr);
		pdu->hdr = NULL;
	}
	return (error);
}

static void
nvmf_tcp_free_pdu(struct nvmf_tcp_rxpdu *pdu)
{
	free(pdu->hdr);
	pdu->hdr = NULL;
}

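/*
 * Handle a termination request received from the other end by logging
 * the reported error and treating the connection as reset.
 */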
static int
nvmf_tcp_handle_term_req(struct nvmf_tcp_rxpdu *pdu)
{
	struct nvme_tcp_term_req_hdr *hdr;

	hdr = (void *)pdu->hdr;

	printf("NVMe/TCP: Received termination request: fes %#x fei %#x\n",
	    le16toh(hdr->fes), le32dec(hdr->fei));
	nvmf_tcp_free_pdu(pdu);
	return (ECONNRESET);
}

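/*
 * Wrap a received command capsule PDU in an nvmf_capsule and queue it
 * for tcp_receive_capsule().
 */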
static int
nvmf_tcp_save_command_capsule(struct nvmf_tcp_qpair *qp,
    struct nvmf_tcp_rxpdu *pdu)
{
	struct nvme_tcp_cmd *cmd;
	struct nvmf_capsule *nc;
	struct nvmf_tcp_capsule *tc;

	cmd = (void *)pdu->hdr;

	nc = nvmf_allocate_command(&qp->qp, &cmd->ccsqe);
	if (nc == NULL)
		return (ENOMEM);

	tc = TCAP(nc);
	tc->rx_pdu = *pdu;

	TAILQ_INSERT_TAIL(&qp->rx_capsules, tc, link);
	return (0);
}

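/*
 * Wrap a received response capsule PDU in an nvmf_capsule and queue it
 * for tcp_receive_capsule().
 */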
static int
nvmf_tcp_save_response_capsule(struct nvmf_tcp_qpair *qp,
    struct nvmf_tcp_rxpdu *pdu)
{
	struct nvme_tcp_rsp *rsp;
	struct nvmf_capsule *nc;
	struct nvmf_tcp_capsule *tc;

	rsp = (void *)pdu->hdr;

	nc = nvmf_allocate_response(&qp->qp, &rsp->rccqe);
	if (nc == NULL)
		return (ENOMEM);

	nc->nc_sqhd_valid = true;
	tc = TCAP(nc);
	tc->rx_pdu = *pdu;

	TAILQ_INSERT_TAIL(&qp->rx_capsules, tc, link);

	/*
	 * Once the CQE has been received, no further transfers to the
	 * command buffer for the associated CID can occur.
	 */
	tcp_purge_command_buffer(qp, rsp->rccqe.cid, 0, true);
	tcp_purge_command_buffer(qp, rsp->rccqe.cid, 0, false);

	return (0);
}

/*
 * Construct and send a PDU that contains an optional data payload.
 * This includes dealing with digests and the length fields in the
 * common header.
 */
static int
nvmf_tcp_construct_pdu(struct nvmf_tcp_qpair *qp, void *hdr, size_t hlen,
    void *data, uint32_t data_len)
{
	struct nvme_tcp_common_pdu_hdr *ch;
	struct iovec iov[5];
	u_int iovcnt;
	uint32_t header_digest, data_digest, pad, pdo, plen;

	plen = hlen;
	if (qp->header_digests)
		plen += sizeof(header_digest);
	if (data_len != 0) {
		pdo = roundup(plen, qp->txpda);
		pad = pdo - plen;
		plen = pdo + data_len;
		if (qp->data_digests)
			plen += sizeof(data_digest);
	} else {
		assert(data == NULL);
		pdo = 0;
		pad = 0;
	}

	ch = hdr;
	ch->hlen = hlen;
	if (qp->header_digests)
		ch->flags |= NVME_TCP_CH_FLAGS_HDGSTF;
	if (qp->data_digests && data_len != 0)
		ch->flags |= NVME_TCP_CH_FLAGS_DDGSTF;
	ch->pdo = pdo;
	ch->plen = htole32(plen);

	/* CH + PSH */
	iov[0].iov_base = hdr;
	iov[0].iov_len = hlen;
	iovcnt = 1;

	/* HDGST */
	if (qp->header_digests) {
		header_digest = compute_digest(hdr, hlen);
		iov[iovcnt].iov_base = &header_digest;
		iov[iovcnt].iov_len = sizeof(header_digest);
		iovcnt++;
	}

	if (pad != 0) {
		/* PAD */
		iov[iovcnt].iov_base = __DECONST(char *, zero_padding);
		iov[iovcnt].iov_len = pad;
		iovcnt++;
	}

	if (data_len != 0) {
		/* DATA */
		iov[iovcnt].iov_base = data;
		iov[iovcnt].iov_len = data_len;
		iovcnt++;

		/* DDGST */
		if (qp->data_digests) {
			data_digest = compute_digest(data, data_len);
			iov[iovcnt].iov_base = &data_digest;
			iov[iovcnt].iov_len = sizeof(data_digest);
			iovcnt++;
		}
	}

	return (nvmf_tcp_write_pdu_iov(qp, iov, iovcnt, plen));
}

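/*
 * Copy the payload of a received H2C_DATA PDU into the matching
 * receive command buffer, validating the transfer tag, offset, and
 * length along the way.
 */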
static int
nvmf_tcp_handle_h2c_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
{
	struct nvme_tcp_h2c_data_hdr *h2c;
	struct nvmf_tcp_command_buffer *cb;
	uint32_t data_len, data_offset;
	const char *icd;

	h2c = (void *)pdu->hdr;
	if (le32toh(h2c->datal) > qp->maxh2cdata) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_LIMIT_EXCEEDED, 0,
		    pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	cb = tcp_find_command_buffer(qp, h2c->cccid, h2c->ttag, true);
	if (cb == NULL) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
		    offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->hdr,
		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	data_len = le32toh(h2c->datal);
	if (data_len != pdu->data_len) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
		    offsetof(struct nvme_tcp_h2c_data_hdr, datal), pdu->hdr,
		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	data_offset = le32toh(h2c->datao);
	if (data_offset < cb->data_offset ||
	    data_offset + data_len > cb->data_offset + cb->data_len) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
		    pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	if (data_offset != cb->data_offset + cb->data_xfered) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	if ((cb->data_xfered + data_len == cb->data_len) !=
	    ((pdu->hdr->flags & NVME_TCP_H2C_DATA_FLAGS_LAST_PDU) != 0)) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	cb->data_xfered += data_len;
	data_offset -= cb->data_offset;
	icd = (const char *)pdu->hdr + pdu->hdr->pdo;
	memcpy((char *)cb->data + data_offset, icd, data_len);

	nvmf_tcp_free_pdu(pdu);
	return (0);
}

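/*
 * Copy the payload of a received C2H_DATA PDU into the matching
 * receive command buffer.  If the SUCCESS flag is set, synthesize a
 * completion so the host does not wait for a response capsule.
 */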
static int
nvmf_tcp_handle_c2h_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
{
	struct nvme_tcp_c2h_data_hdr *c2h;
	struct nvmf_tcp_command_buffer *cb;
	uint32_t data_len, data_offset;
	const char *icd;

	c2h = (void *)pdu->hdr;

	cb = tcp_find_command_buffer(qp, c2h->cccid, 0, true);
	if (cb == NULL) {
		/*
		 * XXX: Could be PDU sequence error if cccid is for a
		 * command that doesn't use a command buffer.
		 */
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
		    offsetof(struct nvme_tcp_c2h_data_hdr, cccid), pdu->hdr,
		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	data_len = le32toh(c2h->datal);
	if (data_len != pdu->data_len) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
		    offsetof(struct nvme_tcp_c2h_data_hdr, datal), pdu->hdr,
		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	data_offset = le32toh(c2h->datao);
	if (data_offset < cb->data_offset ||
	    data_offset + data_len > cb->data_offset + cb->data_len) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
		    pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	if (data_offset != cb->data_offset + cb->data_xfered) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	if ((cb->data_xfered + data_len == cb->data_len) !=
	    ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_LAST_PDU) != 0)) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	cb->data_xfered += data_len;
	data_offset -= cb->data_offset;
	icd = (const char *)pdu->hdr + pdu->hdr->pdo;
	memcpy((char *)cb->data + data_offset, icd, data_len);

	if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) {
		struct nvme_completion cqe;
		struct nvmf_tcp_capsule *tc;
		struct nvmf_capsule *nc;

		memset(&cqe, 0, sizeof(cqe));
		cqe.cid = cb->cid;

		nc = nvmf_allocate_response(&qp->qp, &cqe);
		if (nc == NULL) {
			nvmf_tcp_free_pdu(pdu);
			return (ENOMEM);
		}
		nc->nc_sqhd_valid = false;

		tc = TCAP(nc);
		TAILQ_INSERT_TAIL(&qp->rx_capsules, tc, link);
	}

	nvmf_tcp_free_pdu(pdu);
	return (0);
}

/* NB: cid and ttag are little-endian already. */
static int
tcp_send_h2c_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
    uint32_t data_offset, void *buf, size_t len, bool last_pdu)
{
	struct nvme_tcp_h2c_data_hdr h2c;

	memset(&h2c, 0, sizeof(h2c));
	h2c.common.pdu_type = NVME_TCP_PDU_TYPE_H2C_DATA;
	if (last_pdu)
		h2c.common.flags |= NVME_TCP_H2C_DATA_FLAGS_LAST_PDU;
	h2c.cccid = cid;
	h2c.ttag = ttag;
	h2c.datao = htole32(data_offset);
	h2c.datal = htole32(len);

	return (nvmf_tcp_construct_pdu(qp, &h2c, sizeof(h2c), buf, len));
}

/* Sends one or more H2C_DATA PDUs, subject to MAXH2CDATA. */
static int
tcp_send_h2c_pdus(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
    uint32_t data_offset, void *buf, size_t len, bool last_pdu)
{
	char *p;

	p = buf;
	while (len != 0) {
		size_t todo;
		int error;

		todo = len;
		if (todo > qp->maxh2cdata)
			todo = qp->maxh2cdata;
		error = tcp_send_h2c_pdu(qp, cid, ttag, data_offset, p, todo,
		    last_pdu && todo == len);
		if (error != 0)
			return (error);
		p += todo;
		len -= todo;
		data_offset += todo;
	}
	return (0);
}

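/*
 * Handle a received R2T PDU by transmitting the requested portion of
 * the command's transmit buffer as one or more H2C_DATA PDUs.
 */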
static int
nvmf_tcp_handle_r2t(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
{
	struct nvmf_tcp_command_buffer *cb;
	struct nvme_tcp_r2t_hdr *r2t;
	uint32_t data_len, data_offset;
	int error;

	r2t = (void *)pdu->hdr;

	cb = tcp_find_command_buffer(qp, r2t->cccid, 0, false);
	if (cb == NULL) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
		    offsetof(struct nvme_tcp_r2t_hdr, cccid), pdu->hdr,
		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	data_offset = le32toh(r2t->r2to);
	if (data_offset != cb->data_xfered) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	/*
	 * XXX: The spec does not specify how to handle R2T transfers
	 * out of range of the original command.
	 */
	data_len = le32toh(r2t->r2tl);
	if (data_offset + data_len > cb->data_len) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
		    pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	cb->data_xfered += data_len;

	/*
	 * Write out one or more H2C_DATA PDUs containing the
	 * requested data.
	 */
	error = tcp_send_h2c_pdus(qp, r2t->cccid, r2t->ttag,
	    data_offset, (char *)cb->data + data_offset, data_len, true);

	nvmf_tcp_free_pdu(pdu);
	return (error);
}

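/*
 * Receive a single PDU and dispatch it to the appropriate handler
 * based on its type.
 */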
static int
nvmf_tcp_receive_pdu(struct nvmf_tcp_qpair *qp)
{
	struct nvmf_tcp_rxpdu pdu;
	int error;

	error = nvmf_tcp_read_pdu(qp, &pdu);
	if (error != 0)
		return (error);

	switch (pdu.hdr->pdu_type) {
	default:
		__unreachable();
		break;
	case NVME_TCP_PDU_TYPE_H2C_TERM_REQ:
	case NVME_TCP_PDU_TYPE_C2H_TERM_REQ:
		return (nvmf_tcp_handle_term_req(&pdu));
	case NVME_TCP_PDU_TYPE_CAPSULE_CMD:
		return (nvmf_tcp_save_command_capsule(qp, &pdu));
	case NVME_TCP_PDU_TYPE_CAPSULE_RESP:
		return (nvmf_tcp_save_response_capsule(qp, &pdu));
	case NVME_TCP_PDU_TYPE_H2C_DATA:
		return (nvmf_tcp_handle_h2c_data(qp, &pdu));
	case NVME_TCP_PDU_TYPE_C2H_DATA:
		return (nvmf_tcp_handle_c2h_data(qp, &pdu));
	case NVME_TCP_PDU_TYPE_R2T:
		return (nvmf_tcp_handle_r2t(qp, &pdu));
	}
}

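/*
 * Validate the common fields of a received ICReq or ICResp PDU,
 * sending a termination request for the first invalid field found.
 */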
static bool
nvmf_tcp_validate_ic_pdu(struct nvmf_association *na, struct nvmf_tcp_qpair *qp,
    const struct nvme_tcp_common_pdu_hdr *ch, size_t pdu_len)
{
	const struct nvme_tcp_ic_req *pdu;
	uint32_t plen;
	u_int hlen;

	/* Determine how large of a PDU header to return for errors. */
	hlen = ch->hlen;
	plen = le32toh(ch->plen);
	if (hlen < sizeof(*ch) || hlen > plen)
		hlen = sizeof(*ch);

	/*
	 * Errors must be reported for the lowest incorrect field
	 * first, so validate fields in order.
	 */

	/* Validate pdu_type. */

	/* Controllers only receive PDUs with a PDU direction of 0. */
	if (na->na_controller != ((ch->pdu_type & 0x01) == 0)) {
		na_error(na, "NVMe/TCP: Invalid PDU type %u", ch->pdu_type);
		nvmf_tcp_report_error(na, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 0, ch, pdu_len,
		    hlen);
		return (false);
	}

	switch (ch->pdu_type) {
	case NVME_TCP_PDU_TYPE_IC_REQ:
	case NVME_TCP_PDU_TYPE_IC_RESP:
		break;
	default:
		na_error(na, "NVMe/TCP: Invalid PDU type %u", ch->pdu_type);
		nvmf_tcp_report_error(na, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 0, ch, pdu_len,
		    hlen);
		return (false);
	}

	/* Validate flags. */
	if (ch->flags != 0) {
		na_error(na, "NVMe/TCP: Invalid PDU header flags %#x",
		    ch->flags);
		nvmf_tcp_report_error(na, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 1, ch, pdu_len,
		    hlen);
		return (false);
	}

	/* Validate hlen. */
	if (ch->hlen != 128) {
		na_error(na, "NVMe/TCP: Invalid PDU header length %u",
		    ch->hlen);
		nvmf_tcp_report_error(na, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 2, ch, pdu_len,
		    hlen);
		return (false);
	}

	/* Validate pdo. */
	if (ch->pdo != 0) {
		na_error(na, "NVMe/TCP: Invalid PDU data offset %u", ch->pdo);
		nvmf_tcp_report_error(na, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 3, ch, pdu_len,
		    hlen);
		return (false);
	}

	/* Validate plen. */
	if (plen != 128) {
		na_error(na, "NVMe/TCP: Invalid PDU length %u", plen);
		nvmf_tcp_report_error(na, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 4, ch, pdu_len,
		    hlen);
		return (false);
	}

	/* Validate fields common to both ICReq and ICResp. */
	pdu = (const struct nvme_tcp_ic_req *)ch;
	if (le16toh(pdu->pfv) != 0) {
		na_error(na, "NVMe/TCP: Unsupported PDU version %u",
		    le16toh(pdu->pfv));
		nvmf_tcp_report_error(na, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER,
		    8, ch, pdu_len, hlen);
		return (false);
	}

	if (pdu->hpda > NVME_TCP_HPDA_MAX) {
		na_error(na, "NVMe/TCP: Unsupported PDA %u", pdu->hpda);
		nvmf_tcp_report_error(na, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 10, ch, pdu_len,
		    hlen);
		return (false);
	}

	if (pdu->dgst.bits.reserved != 0) {
		na_error(na, "NVMe/TCP: Invalid digest settings");
		nvmf_tcp_report_error(na, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 11, ch, pdu_len,
		    hlen);
		return (false);
	}

	return (true);
}

static bool
nvmf_tcp_read_ic_req(struct nvmf_association *na, struct nvmf_tcp_qpair *qp,
    struct nvme_tcp_ic_req *pdu)
{
	int error;

	error = nvmf_tcp_read_buffer(qp->s, pdu, sizeof(*pdu));
	if (error != 0) {
		na_error(na, "NVMe/TCP: Failed to read IC request: %s",
		    strerror(error));
		return (false);
	}

	return (nvmf_tcp_validate_ic_pdu(na, qp, &pdu->common, sizeof(*pdu)));
}

static bool
nvmf_tcp_read_ic_resp(struct nvmf_association *na, struct nvmf_tcp_qpair *qp,
    struct nvme_tcp_ic_resp *pdu)
{
	int error;

	error = nvmf_tcp_read_buffer(qp->s, pdu, sizeof(*pdu));
	if (error != 0) {
		na_error(na, "NVMe/TCP: Failed to read IC response: %s",
		    strerror(error));
		return (false);
	}

	return (nvmf_tcp_validate_ic_pdu(na, qp, &pdu->common, sizeof(*pdu)));
}

static struct nvmf_association *
tcp_allocate_association(bool controller,
    const struct nvmf_association_params *params)
{
	struct nvmf_tcp_association *ta;

	if (controller) {
		/* 7.4.10.3 */
		if (params->tcp.maxh2cdata < 4096 ||
		    params->tcp.maxh2cdata % 4 != 0)
			return (NULL);
	}

	ta = calloc(1, sizeof(*ta));

	return (&ta->na);
}

static void
tcp_update_association(struct nvmf_association *na,
    const struct nvme_controller_data *cdata)
{
	struct nvmf_tcp_association *ta = TASSOC(na);

	ta->ioccsz = le32toh(cdata->ioccsz);
}

static void
tcp_free_association(struct nvmf_association *na)
{
	free(na);
}

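/*
 * Perform the host side of connection setup: send an ICReq PDU,
 * validate the ICResp, and record the negotiated transport parameters
 * in the queue pair.
 */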
static int
tcp_connect(struct nvmf_tcp_qpair *qp, struct nvmf_association *na, bool admin)
{
	const struct nvmf_association_params *params = &na->na_params;
	struct nvmf_tcp_association *ta = TASSOC(na);
	struct nvme_tcp_ic_req ic_req;
	struct nvme_tcp_ic_resp ic_resp;
	uint32_t maxh2cdata;
	int error;

	if (!admin) {
		if (ta->ioccsz == 0) {
			na_error(na, "TCP I/O queues require cdata");
			return (EINVAL);
		}
		if (ta->ioccsz < 4) {
			na_error(na, "Invalid IOCCSZ %u", ta->ioccsz);
			return (EINVAL);
		}
	}

	memset(&ic_req, 0, sizeof(ic_req));
	ic_req.common.pdu_type = NVME_TCP_PDU_TYPE_IC_REQ;
	ic_req.common.hlen = sizeof(ic_req);
	ic_req.common.plen = htole32(sizeof(ic_req));
	ic_req.pfv = htole16(0);
	ic_req.hpda = params->tcp.pda;
	if (params->tcp.header_digests)
		ic_req.dgst.bits.hdgst_enable = 1;
	if (params->tcp.data_digests)
		ic_req.dgst.bits.ddgst_enable = 1;
	ic_req.maxr2t = htole32(params->tcp.maxr2t);

	error = nvmf_tcp_write_pdu(qp, &ic_req, sizeof(ic_req));
	if (error != 0) {
		na_error(na, "Failed to write IC request: %s", strerror(error));
		return (error);
	}

	if (!nvmf_tcp_read_ic_resp(na, qp, &ic_resp))
		return (ECONNRESET);

	/* Ensure the controller didn't enable digests we didn't request. */
	if ((!params->tcp.header_digests &&
	    ic_resp.dgst.bits.hdgst_enable != 0) ||
	    (!params->tcp.data_digests &&
	    ic_resp.dgst.bits.ddgst_enable != 0)) {
		na_error(na, "Controller enabled unrequested digests");
		nvmf_tcp_report_error(na, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER,
		    11, &ic_resp, sizeof(ic_resp), sizeof(ic_resp));
		return (EPROTO);
	}

	/*
	 * XXX: Is there an upper-bound to enforce here?  Perhaps pick
	 * some large value and report larger values as an unsupported
	 * parameter?
	 */
	maxh2cdata = le32toh(ic_resp.maxh2cdata);
	if (maxh2cdata < 4096 || maxh2cdata % 4 != 0) {
		na_error(na, "Invalid MAXH2CDATA %u", maxh2cdata);
		nvmf_tcp_report_error(na, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 12, &ic_resp,
		    sizeof(ic_resp), sizeof(ic_resp));
		return (EINVAL);
	}

	qp->rxpda = (params->tcp.pda + 1) * 4;
	qp->txpda = (ic_resp.cpda + 1) * 4;
	qp->header_digests = ic_resp.dgst.bits.hdgst_enable != 0;
	qp->data_digests = ic_resp.dgst.bits.ddgst_enable != 0;
	qp->maxr2t = params->tcp.maxr2t;
	qp->maxh2cdata = maxh2cdata;
	if (admin)
		/* 7.4.3 */
		qp->max_icd = 8192;
	else
		qp->max_icd = (ta->ioccsz - 4) * 16;

	return (0);
}

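/*
 * Perform the controller side of connection setup: read and validate
 * the host's ICReq, send an ICResp, and record the negotiated
 * transport parameters in the queue pair.
 */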
static int
tcp_accept(struct nvmf_tcp_qpair *qp, struct nvmf_association *na)
{
	const struct nvmf_association_params *params = &na->na_params;
	struct nvme_tcp_ic_req ic_req;
	struct nvme_tcp_ic_resp ic_resp;
	int error;

	if (!nvmf_tcp_read_ic_req(na, qp, &ic_req))
		return (ECONNRESET);

	memset(&ic_resp, 0, sizeof(ic_resp));
	ic_resp.common.pdu_type = NVME_TCP_PDU_TYPE_IC_RESP;
	ic_resp.common.hlen = sizeof(ic_resp);
	ic_resp.common.plen = htole32(sizeof(ic_resp));
	ic_resp.pfv = htole16(0);
	ic_resp.cpda = params->tcp.pda;
	if (params->tcp.header_digests && ic_req.dgst.bits.hdgst_enable != 0)
		ic_resp.dgst.bits.hdgst_enable = 1;
	if (params->tcp.data_digests && ic_req.dgst.bits.ddgst_enable != 0)
		ic_resp.dgst.bits.ddgst_enable = 1;
	ic_resp.maxh2cdata = htole32(params->tcp.maxh2cdata);

	error = nvmf_tcp_write_pdu(qp, &ic_resp, sizeof(ic_resp));
	if (error != 0) {
		na_error(na, "Failed to write IC response: %s",
		    strerror(error));
		return (error);
	}

	qp->rxpda = (params->tcp.pda + 1) * 4;
	qp->txpda = (ic_req.hpda + 1) * 4;
	qp->header_digests = ic_resp.dgst.bits.hdgst_enable != 0;
	qp->data_digests = ic_resp.dgst.bits.ddgst_enable != 0;
	qp->maxr2t = le32toh(ic_req.maxr2t);
	qp->maxh2cdata = params->tcp.maxh2cdata;
	qp->max_icd = 0;	/* XXX */
	return (0);
}

static struct nvmf_qpair *
tcp_allocate_qpair(struct nvmf_association *na,
    const struct nvmf_qpair_params *qparams)
{
	const struct nvmf_association_params *aparams = &na->na_params;
	struct nvmf_tcp_qpair *qp;
	int error;

	if (aparams->tcp.pda > NVME_TCP_CPDA_MAX) {
		na_error(na, "Invalid PDA");
		return (NULL);
	}

	qp = calloc(1, sizeof(*qp));
	qp->s = qparams->tcp.fd;
	LIST_INIT(&qp->rx_buffers);
	LIST_INIT(&qp->tx_buffers);
	TAILQ_INIT(&qp->rx_capsules);
	if (na->na_controller)
		error = tcp_accept(qp, na);
	else
		error = tcp_connect(qp, na, qparams->admin);
	if (error != 0) {
		free(qp);
		return (NULL);
	}

	return (&qp->qp);
}

static void
tcp_free_qpair(struct nvmf_qpair *nq)
{
	struct nvmf_tcp_qpair *qp = TQP(nq);
	struct nvmf_tcp_capsule *ntc, *tc;
	struct nvmf_tcp_command_buffer *ncb, *cb;

	TAILQ_FOREACH_SAFE(tc, &qp->rx_capsules, link, ntc) {
		TAILQ_REMOVE(&qp->rx_capsules, tc, link);
		nvmf_free_capsule(&tc->nc);
	}
	LIST_FOREACH_SAFE(cb, &qp->rx_buffers, link, ncb) {
		tcp_free_command_buffer(cb);
	}
	LIST_FOREACH_SAFE(cb, &qp->tx_buffers, link, ncb) {
		tcp_free_command_buffer(cb);
	}
	free(qp);
}

static void
tcp_kernel_handoff_params(struct nvmf_qpair *nq, nvlist_t *nvl)
{
	struct nvmf_tcp_qpair *qp = TQP(nq);

	nvlist_add_number(nvl, "fd", qp->s);
	nvlist_add_number(nvl, "rxpda", qp->rxpda);
	nvlist_add_number(nvl, "txpda", qp->txpda);
	nvlist_add_bool(nvl, "header_digests", qp->header_digests);
	nvlist_add_bool(nvl, "data_digests", qp->data_digests);
	nvlist_add_number(nvl, "maxr2t", qp->maxr2t);
	nvlist_add_number(nvl, "maxh2cdata", qp->maxh2cdata);
	nvlist_add_number(nvl, "max_icd", qp->max_icd);
}

static int
tcp_populate_dle(struct nvmf_qpair *nq, struct nvme_discovery_log_entry *dle)
{
	struct nvmf_tcp_qpair *qp = TQP(nq);
	struct sockaddr_storage ss;
	socklen_t ss_len;

	ss_len = sizeof(ss);
	if (getpeername(qp->s, (struct sockaddr *)&ss, &ss_len) == -1)
		return (errno);

	if (getnameinfo((struct sockaddr *)&ss, ss_len, dle->traddr,
	    sizeof(dle->traddr), dle->trsvcid, sizeof(dle->trsvcid),
	    NI_NUMERICHOST | NI_NUMERICSERV) != 0)
		return (EINVAL);

	return (0);
}

static struct nvmf_capsule *
tcp_allocate_capsule(struct nvmf_qpair *qp __unused)
{
	struct nvmf_tcp_capsule *nc;

	nc = calloc(1, sizeof(*nc));
	return (&nc->nc);
}

static void
tcp_free_capsule(struct nvmf_capsule *nc)
{
	struct nvmf_tcp_capsule *tc = TCAP(nc);

	nvmf_tcp_free_pdu(&tc->rx_pdu);
	if (tc->cb != NULL)
		tcp_free_command_buffer(tc->cb);
	free(tc);
}

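/*
 * Transmit a command capsule.  Small payloads are sent as in-capsule
 * data; otherwise a command buffer is queued for a later C2H_DATA or
 * R2T-driven transfer.
 */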
static int
tcp_transmit_command(struct nvmf_capsule *nc)
{
	struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
	struct nvmf_tcp_capsule *tc = TCAP(nc);
	struct nvme_tcp_cmd cmd;
	struct nvme_sgl_descriptor *sgl;
	int error;
	bool use_icd;

	use_icd = false;
	if (nc->nc_data_len != 0 && nc->nc_send_data &&
	    nc->nc_data_len <= qp->max_icd)
		use_icd = true;

	memset(&cmd, 0, sizeof(cmd));
	cmd.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_CMD;
	cmd.ccsqe = nc->nc_sqe;

	/* Populate SGL in SQE. */
	sgl = &cmd.ccsqe.sgl;
	memset(sgl, 0, sizeof(*sgl));
	sgl->address = 0;
	sgl->length = htole32(nc->nc_data_len);
	if (use_icd) {
		/* Use in-capsule data. */
		sgl->type = NVME_SGL_TYPE_ICD;
	} else {
		/* Use a command buffer. */
		sgl->type = NVME_SGL_TYPE_COMMAND_BUFFER;
	}

	/* Send command capsule. */
	error = nvmf_tcp_construct_pdu(qp, &cmd, sizeof(cmd), use_icd ?
	    nc->nc_data : NULL, use_icd ? nc->nc_data_len : 0);
	if (error != 0)
		return (error);

	/*
	 * If data will be transferred using a command buffer, allocate a
	 * buffer structure and queue it.
	 */
	if (nc->nc_data_len != 0 && !use_icd)
		tc->cb = tcp_alloc_command_buffer(qp, nc->nc_data, 0,
		    nc->nc_data_len, cmd.ccsqe.cid, 0, !nc->nc_send_data);

	return (0);
}

static int
tcp_transmit_response(struct nvmf_capsule *nc)
{
	struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
	struct nvme_tcp_rsp rsp;

	memset(&rsp, 0, sizeof(rsp));
	rsp.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_RESP;
	rsp.rccqe = nc->nc_cqe;

	return (nvmf_tcp_construct_pdu(qp, &rsp, sizeof(rsp), NULL, 0));
}

static int
tcp_transmit_capsule(struct nvmf_capsule *nc)
{
	if (nc->nc_qe_len == sizeof(struct nvme_command))
		return (tcp_transmit_command(nc));
	else
		return (tcp_transmit_response(nc));
}

static int
tcp_receive_capsule(struct nvmf_qpair *nq, struct nvmf_capsule **ncp)
{
	struct nvmf_tcp_qpair *qp = TQP(nq);
	struct nvmf_tcp_capsule *tc;
	int error;

	while (TAILQ_EMPTY(&qp->rx_capsules)) {
		error = nvmf_tcp_receive_pdu(qp);
		if (error != 0)
			return (error);
	}
	tc = TAILQ_FIRST(&qp->rx_capsules);
	TAILQ_REMOVE(&qp->rx_capsules, tc, link);
	*ncp = &tc->nc;
	return (0);
}

static uint8_t
tcp_validate_command_capsule(const struct nvmf_capsule *nc)
{
	const struct nvmf_tcp_capsule *tc = CTCAP(nc);
	const struct nvme_sgl_descriptor *sgl;

	assert(tc->rx_pdu.hdr != NULL);

	sgl = &nc->nc_sqe.sgl;
	switch (sgl->type) {
	case NVME_SGL_TYPE_ICD:
		if (tc->rx_pdu.data_len != le32toh(sgl->length)) {
			printf("NVMe/TCP: Command Capsule with mismatched ICD length\n");
			return (NVME_SC_DATA_SGL_LENGTH_INVALID);
		}
		break;
	case NVME_SGL_TYPE_COMMAND_BUFFER:
		if (tc->rx_pdu.data_len != 0) {
			printf("NVMe/TCP: Command Buffer SGL with ICD\n");
			return (NVME_SC_INVALID_FIELD);
		}
		break;
	default:
		printf("NVMe/TCP: Invalid SGL type in Command Capsule\n");
		return (NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID);
	}

	if (sgl->address != 0) {
		printf("NVMe/TCP: Invalid SGL offset in Command Capsule\n");
		return (NVME_SC_SGL_OFFSET_INVALID);
	}

	return (NVME_SC_SUCCESS);
}

static size_t
tcp_capsule_data_len(const struct nvmf_capsule *nc)
{
	assert(nc->nc_qe_len == sizeof(struct nvme_command));
	return (le32toh(nc->nc_sqe.sgl.length));
}

/* NB: cid and ttag are both little-endian already. */
static int
tcp_send_r2t(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
    uint32_t data_offset, uint32_t data_len)
{
	struct nvme_tcp_r2t_hdr r2t;

	memset(&r2t, 0, sizeof(r2t));
	r2t.common.pdu_type = NVME_TCP_PDU_TYPE_R2T;
	r2t.cccid = cid;
	r2t.ttag = ttag;
	r2t.r2to = htole32(data_offset);
	r2t.r2tl = htole32(data_len);

	return (nvmf_tcp_construct_pdu(qp, &r2t, sizeof(r2t), NULL, 0));
}

static int
tcp_receive_r2t_data(const struct nvmf_capsule *nc, uint32_t data_offset,
    void *buf, size_t len)
{
	struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
	struct nvmf_tcp_command_buffer *cb;
	int error;
	uint16_t ttag;

	/*
	 * Don't bother byte-swapping ttag as it is just a cookie
	 * value returned by the other end as-is.
	 */
	ttag = qp->next_ttag++;

	error = tcp_send_r2t(qp, nc->nc_sqe.cid, ttag, data_offset, len);
	if (error != 0)
		return (error);

	cb = tcp_alloc_command_buffer(qp, buf, data_offset, len,
	    nc->nc_sqe.cid, ttag, true);

	/* Parse received PDUs until the data transfer is complete. */
	while (cb->data_xfered < cb->data_len) {
		error = nvmf_tcp_receive_pdu(qp);
		if (error != 0)
			break;
	}
	tcp_free_command_buffer(cb);
	return (error);
}

static int
tcp_receive_icd_data(const struct nvmf_capsule *nc, uint32_t data_offset,
    void *buf, size_t len)
{
	const struct nvmf_tcp_capsule *tc = CTCAP(nc);
	const char *icd;

	icd = (const char *)tc->rx_pdu.hdr + tc->rx_pdu.hdr->pdo + data_offset;
	memcpy(buf, icd, len);
	return (0);
}

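/*
 * Controller-side receive of host data for a command, either by
 * copying in-capsule data or by issuing an R2T and waiting for the
 * resulting H2C_DATA PDUs.
 */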
static int
tcp_receive_controller_data(const struct nvmf_capsule *nc, uint32_t data_offset,
    void *buf, size_t len)
{
	struct nvmf_association *na = nc->nc_qpair->nq_association;
	const struct nvme_sgl_descriptor *sgl;
	size_t data_len;

	if (nc->nc_qe_len != sizeof(struct nvme_command) || !na->na_controller)
		return (EINVAL);

	sgl = &nc->nc_sqe.sgl;
	data_len = le32toh(sgl->length);
	if (data_offset + len > data_len)
		return (EFBIG);

	if (sgl->type == NVME_SGL_TYPE_ICD)
		return (tcp_receive_icd_data(nc, data_offset, buf, len));
	else
		return (tcp_receive_r2t_data(nc, data_offset, buf, len));
}

/* NB: cid is little-endian already. */
static int
tcp_send_c2h_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid,
    uint32_t data_offset, const void *buf, size_t len, bool last_pdu,
    bool success)
{
	struct nvme_tcp_c2h_data_hdr c2h;

	memset(&c2h, 0, sizeof(c2h));
	c2h.common.pdu_type = NVME_TCP_PDU_TYPE_C2H_DATA;
	if (last_pdu)
		c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_LAST_PDU;
	if (success)
		c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_SUCCESS;
	c2h.cccid = cid;
	c2h.datao = htole32(data_offset);
	c2h.datal = htole32(len);

	return (nvmf_tcp_construct_pdu(qp, &c2h, sizeof(c2h),
	    __DECONST(void *, buf), len));
}

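/*
 * Controller-side transmit of command data to the host as a sequence
 * of C2H_DATA PDUs, optionally using the SUCCESS flag in place of a
 * separate response capsule.
 */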
static int
tcp_send_controller_data(const struct nvmf_capsule *nc, const void *buf,
    size_t len)
{
	struct nvmf_association *na = nc->nc_qpair->nq_association;
	struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
	const struct nvme_sgl_descriptor *sgl;
	const char *src;
	size_t todo;
	uint32_t data_len, data_offset;
	int error;
	bool last_pdu, send_success_flag;

	if (nc->nc_qe_len != sizeof(struct nvme_command) || !na->na_controller)
		return (EINVAL);

	sgl = &nc->nc_sqe.sgl;
	data_len = le32toh(sgl->length);
	if (len != data_len) {
		nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD);
		return (EFBIG);
	}

	if (sgl->type != NVME_SGL_TYPE_COMMAND_BUFFER) {
		nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD);
		return (EINVAL);
	}

	/* Use the SUCCESS flag if SQ flow control is disabled. */
	send_success_flag = !qp->qp.nq_flow_control;

	/*
	 * Write out one or more C2H_DATA PDUs containing the data.
	 * Each PDU is arbitrarily capped at 256k.
	 */
	data_offset = 0;
	src = buf;
	while (len > 0) {
		if (len > 256 * 1024) {
			todo = 256 * 1024;
			last_pdu = false;
		} else {
			todo = len;
			last_pdu = true;
		}
		error = tcp_send_c2h_pdu(qp, nc->nc_sqe.cid, data_offset,
		    src, todo, last_pdu, last_pdu && send_success_flag);
		if (error != 0) {
			nvmf_send_generic_error(nc,
			    NVME_SC_TRANSIENT_TRANSPORT_ERROR);
			return (error);
		}
		data_offset += todo;
		src += todo;
		len -= todo;
	}
	if (!send_success_flag)
		nvmf_send_success(nc);
	return (0);
}

struct nvmf_transport_ops tcp_ops = {
	.allocate_association = tcp_allocate_association,
	.update_association = tcp_update_association,
	.free_association = tcp_free_association,
	.allocate_qpair = tcp_allocate_qpair,
	.free_qpair = tcp_free_qpair,
	.kernel_handoff_params = tcp_kernel_handoff_params,
	.populate_dle = tcp_populate_dle,
	.allocate_capsule = tcp_allocate_capsule,
	.free_capsule = tcp_free_capsule,
	.transmit_capsule = tcp_transmit_capsule,
	.receive_capsule = tcp_receive_capsule,
	.validate_command_capsule = tcp_validate_command_capsule,
	.capsule_data_len = tcp_capsule_data_len,
	.receive_controller_data = tcp_receive_controller_data,
	.send_controller_data = tcp_send_controller_data,
};