xref: /freebsd/sys/dev/nvmf/nvmf_tcp.c (revision 365b89e8ea4af34a05f68aa28e77573e89fa00b2)
1  /*-
2   * SPDX-License-Identifier: BSD-2-Clause
3   *
4   * Copyright (c) 2022-2024 Chelsio Communications, Inc.
5   * Written by: John Baldwin <jhb@FreeBSD.org>
6   */
7  
8  #include <sys/param.h>
9  #include <sys/capsicum.h>
10  #include <sys/condvar.h>
11  #include <sys/file.h>
12  #include <sys/gsb_crc32.h>
13  #include <sys/kernel.h>
14  #include <sys/kthread.h>
15  #include <sys/limits.h>
16  #include <sys/lock.h>
17  #include <sys/malloc.h>
18  #include <sys/mbuf.h>
19  #include <sys/module.h>
20  #include <sys/mutex.h>
21  #include <sys/nv.h>
22  #include <sys/protosw.h>
23  #include <sys/refcount.h>
24  #include <sys/socket.h>
25  #include <sys/socketvar.h>
26  #include <sys/sysctl.h>
27  #include <sys/uio.h>
28  #include <netinet/in.h>
29  #include <dev/nvme/nvme.h>
30  #include <dev/nvmf/nvmf.h>
31  #include <dev/nvmf/nvmf_proto.h>
32  #include <dev/nvmf/nvmf_tcp.h>
33  #include <dev/nvmf/nvmf_transport.h>
34  #include <dev/nvmf/nvmf_transport_internal.h>
35  
36  struct nvmf_tcp_capsule;
37  struct nvmf_tcp_qpair;
38  
39  struct nvmf_tcp_command_buffer {
40  	struct nvmf_tcp_qpair *qp;
41  
42  	struct nvmf_io_request io;
43  	size_t	data_len;
44  	size_t	data_xfered;
45  	uint32_t data_offset;
46  
47  	u_int	refs;
48  	int	error;
49  
50  	uint16_t cid;
51  	uint16_t ttag;
52  
53  	TAILQ_ENTRY(nvmf_tcp_command_buffer) link;
54  
55  	/* Controller only */
56  	struct nvmf_tcp_capsule *tc;
57  };
58  
59  struct nvmf_tcp_command_buffer_list {
60  	TAILQ_HEAD(, nvmf_tcp_command_buffer) head;
61  	struct mtx lock;
62  };
63  
64  struct nvmf_tcp_qpair {
65  	struct nvmf_qpair qp;
66  
67  	struct socket *so;
68  
69  	volatile u_int refs;	/* Every allocated capsule holds a reference */
70  	uint8_t	txpda;
71  	uint8_t rxpda;
72  	bool header_digests;
73  	bool data_digests;
74  	uint32_t maxr2t;
75  	uint32_t maxh2cdata;	/* Controller only */
76  	uint32_t max_tx_data;
77  	uint32_t max_icd;	/* Host only */
78  	uint16_t next_ttag;	/* Controller only */
79  	u_int num_ttags;	/* Controller only */
80  	u_int active_ttags;	/* Controller only */
81  	bool send_success;	/* Controller only */
82  
83  	/* Receive state. */
84  	struct thread *rx_thread;
85  	struct cv rx_cv;
86  	bool	rx_shutdown;
87  
88  	/* Transmit state. */
89  	struct thread *tx_thread;
90  	struct cv tx_cv;
91  	bool	tx_shutdown;
92  	struct mbufq tx_pdus;
93  	STAILQ_HEAD(, nvmf_tcp_capsule) tx_capsules;
94  
95  	struct nvmf_tcp_command_buffer_list tx_buffers;
96  	struct nvmf_tcp_command_buffer_list rx_buffers;
97  
98  	/*
99  	 * For the controller, an RX command buffer can be in one of
100  	 * two locations, all protected by the rx_buffers.lock.  If a
101  	 * receive request is waiting for either an R2T slot for its
102  	 * command (due to exceeding MAXR2T), or a transfer tag it is
103  	 * command (due to exceeding MAXR2T) or a transfer tag, it is
104  	 * an active transfer tag, it moves to the open_ttags[] array
105  	 * (indexed by the tag) until it completes.
106  	 */
107  	struct nvmf_tcp_command_buffer **open_ttags;	/* Controller only */
108  };
109  
110  struct nvmf_tcp_rxpdu {
111  	struct mbuf *m;
112  	const struct nvme_tcp_common_pdu_hdr *hdr;
113  	uint32_t data_len;
114  	bool data_digest_mismatch;
115  };
116  
117  struct nvmf_tcp_capsule {
118  	struct nvmf_capsule nc;
119  
120  	volatile u_int refs;
121  
122  	struct nvmf_tcp_rxpdu rx_pdu;
123  
124  	uint32_t active_r2ts;		/* Controller only */
125  #ifdef INVARIANTS
126  	uint32_t tx_data_offset;	/* Controller only */
127  	u_int pending_r2ts;		/* Controller only */
128  #endif
129  
130  	STAILQ_ENTRY(nvmf_tcp_capsule) link;
131  };
132  
133  #define	TCAP(nc)	((struct nvmf_tcp_capsule *)(nc))
134  #define	TQP(qp)		((struct nvmf_tcp_qpair *)(qp))
135  
136  static void	tcp_release_capsule(struct nvmf_tcp_capsule *tc);
137  static void	tcp_free_qpair(struct nvmf_qpair *nq);
138  
139  SYSCTL_NODE(_kern_nvmf, OID_AUTO, tcp, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
140      "TCP transport");
141  static u_int tcp_max_transmit_data = 256 * 1024;
142  SYSCTL_UINT(_kern_nvmf_tcp, OID_AUTO, max_transmit_data, CTLFLAG_RWTUN,
143      &tcp_max_transmit_data, 0,
144      "Maximum size of data payload in a transmitted PDU");
145  
146  static MALLOC_DEFINE(M_NVMF_TCP, "nvmf_tcp", "NVMe over TCP");
147  
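/*
 * CRC32C helpers for the NVMe/TCP header and data digests.  mbuf_crc32c()
 * uses m_apply() to accumulate the digest over a range of an mbuf chain,
 * while compute_digest() handles a contiguous buffer.
 */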
148  static int
149  mbuf_crc32c_helper(void *arg, void *data, u_int len)
150  {
151  	uint32_t *digestp = arg;
152  
153  	*digestp = calculate_crc32c(*digestp, data, len);
154  	return (0);
155  }
156  
157  static uint32_t
158  mbuf_crc32c(struct mbuf *m, u_int offset, u_int len)
159  {
160  	uint32_t digest = 0xffffffff;
161  
162  	m_apply(m, offset, len, mbuf_crc32c_helper, &digest);
163  	digest = digest ^ 0xffffffff;
164  
165  	return (digest);
166  }
167  
168  static uint32_t
169  compute_digest(const void *buf, size_t len)
170  {
171  	return (calculate_crc32c(0xffffffff, buf, len) ^ 0xffffffff);
172  }
173  
174  static struct nvmf_tcp_command_buffer *
175  tcp_alloc_command_buffer(struct nvmf_tcp_qpair *qp,
176      const struct nvmf_io_request *io, uint32_t data_offset, size_t data_len,
177      uint16_t cid)
178  {
179  	struct nvmf_tcp_command_buffer *cb;
180  
181  	cb = malloc(sizeof(*cb), M_NVMF_TCP, M_WAITOK);
182  	cb->qp = qp;
183  	cb->io = *io;
184  	cb->data_offset = data_offset;
185  	cb->data_len = data_len;
186  	cb->data_xfered = 0;
187  	refcount_init(&cb->refs, 1);
188  	cb->error = 0;
189  	cb->cid = cid;
190  	cb->ttag = 0;
191  	cb->tc = NULL;
192  
193  	return (cb);
194  }
195  
196  static void
197  tcp_hold_command_buffer(struct nvmf_tcp_command_buffer *cb)
198  {
199  	refcount_acquire(&cb->refs);
200  }
201  
202  static void
203  tcp_free_command_buffer(struct nvmf_tcp_command_buffer *cb)
204  {
205  	nvmf_complete_io_request(&cb->io, cb->data_xfered, cb->error);
206  	if (cb->tc != NULL)
207  		tcp_release_capsule(cb->tc);
208  	free(cb, M_NVMF_TCP);
209  }
210  
211  static void
212  tcp_release_command_buffer(struct nvmf_tcp_command_buffer *cb)
213  {
214  	if (refcount_release(&cb->refs))
215  		tcp_free_command_buffer(cb);
216  }
217  
218  static void
219  tcp_add_command_buffer(struct nvmf_tcp_command_buffer_list *list,
220      struct nvmf_tcp_command_buffer *cb)
221  {
222  	mtx_assert(&list->lock, MA_OWNED);
223  	TAILQ_INSERT_HEAD(&list->head, cb, link);
224  }
225  
226  static struct nvmf_tcp_command_buffer *
227  tcp_find_command_buffer(struct nvmf_tcp_command_buffer_list *list,
228      uint16_t cid, uint16_t ttag)
229  {
230  	struct nvmf_tcp_command_buffer *cb;
231  
232  	mtx_assert(&list->lock, MA_OWNED);
233  	TAILQ_FOREACH(cb, &list->head, link) {
234  		if (cb->cid == cid && cb->ttag == ttag)
235  			return (cb);
236  	}
237  	return (NULL);
238  }
239  
240  static void
241  tcp_remove_command_buffer(struct nvmf_tcp_command_buffer_list *list,
242      struct nvmf_tcp_command_buffer *cb)
243  {
244  	mtx_assert(&list->lock, MA_OWNED);
245  	TAILQ_REMOVE(&list->head, cb, link);
246  }
247  
248  static void
249  tcp_purge_command_buffer(struct nvmf_tcp_command_buffer_list *list,
250      uint16_t cid, uint16_t ttag)
251  {
252  	struct nvmf_tcp_command_buffer *cb;
253  
254  	mtx_lock(&list->lock);
255  	cb = tcp_find_command_buffer(list, cid, ttag);
256  	if (cb != NULL) {
257  		tcp_remove_command_buffer(list, cb);
258  		mtx_unlock(&list->lock);
259  		tcp_release_command_buffer(cb);
260  	} else
261  		mtx_unlock(&list->lock);
262  }
263  
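/*
 * Queue a fully constructed PDU for transmission and wake the transmit
 * thread if the socket currently has room for more data.
 */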
264  static void
265  nvmf_tcp_write_pdu(struct nvmf_tcp_qpair *qp, struct mbuf *m)
266  {
267  	struct socket *so = qp->so;
268  
269  	SOCKBUF_LOCK(&so->so_snd);
270  	mbufq_enqueue(&qp->tx_pdus, m);
271  	/* XXX: Do we need to handle sb_hiwat being wrong? */
272  	if (sowriteable(so))
273  		cv_signal(&qp->tx_cv);
274  	SOCKBUF_UNLOCK(&so->so_snd);
275  }
276  
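/*
 * Send a termination request PDU reporting a fatal error (fes/fei),
 * echoing back up to NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE bytes of the
 * offending PDU header as error data.
 */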
277  static void
278  nvmf_tcp_report_error(struct nvmf_tcp_qpair *qp, uint16_t fes, uint32_t fei,
279      struct mbuf *rx_pdu, u_int hlen)
280  {
281  	struct nvme_tcp_term_req_hdr *hdr;
282  	struct mbuf *m;
283  
284  	if (hlen != 0) {
285  		hlen = min(hlen, NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE);
286  		hlen = min(hlen, m_length(rx_pdu, NULL));
287  	}
288  
289  	m = m_get2(sizeof(*hdr) + hlen, M_WAITOK, MT_DATA, 0);
290  	m->m_len = sizeof(*hdr) + hlen;
291  	hdr = mtod(m, void *);
292  	memset(hdr, 0, sizeof(*hdr));
293  	hdr->common.pdu_type = qp->qp.nq_controller ?
294  	    NVME_TCP_PDU_TYPE_C2H_TERM_REQ : NVME_TCP_PDU_TYPE_H2C_TERM_REQ;
295  	hdr->common.hlen = sizeof(*hdr);
296  	hdr->common.plen = sizeof(*hdr) + hlen;
297  	hdr->fes = htole16(fes);
298  	le32enc(hdr->fei, fei);
299  	if (hlen != 0)
300  		m_copydata(rx_pdu, 0, hlen, (caddr_t)(hdr + 1));
301  
302  	nvmf_tcp_write_pdu(qp, m);
303  }
304  
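/*
 * Validate the common header and digests of a received PDU.  A header
 * digest mismatch is fatal and triggers a termination request; a data
 * digest mismatch is only recorded in the rxpdu so that the associated
 * I/O request can be failed without tearing down the connection.
 */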
305  static int
306  nvmf_tcp_validate_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
307  {
308  	const struct nvme_tcp_common_pdu_hdr *ch;
309  	struct mbuf *m = pdu->m;
310  	uint32_t data_len, fei, plen;
311  	uint32_t digest, rx_digest;
312  	u_int hlen;
313  	int error;
314  	uint16_t fes;
315  
316  	/* Determine how large of a PDU header to return for errors. */
317  	ch = pdu->hdr;
318  	hlen = ch->hlen;
319  	plen = le32toh(ch->plen);
320  	if (hlen < sizeof(*ch) || hlen > plen)
321  		hlen = sizeof(*ch);
322  
323  	error = nvmf_tcp_validate_pdu_header(ch, qp->qp.nq_controller,
324  	    qp->header_digests, qp->data_digests, qp->rxpda, &data_len, &fes,
325  	    &fei);
326  	if (error != 0) {
327  		if (error != ECONNRESET)
328  			nvmf_tcp_report_error(qp, fes, fei, m, hlen);
329  		return (error);
330  	}
331  
332  	/* Check header digest if present. */
333  	if ((ch->flags & NVME_TCP_CH_FLAGS_HDGSTF) != 0) {
334  		digest = mbuf_crc32c(m, 0, ch->hlen);
335  		m_copydata(m, ch->hlen, sizeof(rx_digest), (caddr_t)&rx_digest);
336  		if (digest != rx_digest) {
337  			printf("NVMe/TCP: Header digest mismatch\n");
338  			nvmf_tcp_report_error(qp,
339  			    NVME_TCP_TERM_REQ_FES_HDGST_ERROR, rx_digest, m,
340  			    hlen);
341  			return (EBADMSG);
342  		}
343  	}
344  
345  	/* Check data digest if present. */
346  	pdu->data_digest_mismatch = false;
347  	if ((ch->flags & NVME_TCP_CH_FLAGS_DDGSTF) != 0) {
348  		digest = mbuf_crc32c(m, ch->pdo, data_len);
349  		m_copydata(m, plen - sizeof(rx_digest), sizeof(rx_digest),
350  		    (caddr_t)&rx_digest);
351  		if (digest != rx_digest) {
352  			printf("NVMe/TCP: Data digest mismatch\n");
353  			pdu->data_digest_mismatch = true;
354  		}
355  	}
356  
357  	pdu->data_len = data_len;
358  	return (0);
359  }
360  
361  static void
362  nvmf_tcp_free_pdu(struct nvmf_tcp_rxpdu *pdu)
363  {
364  	m_freem(pdu->m);
365  	pdu->m = NULL;
366  	pdu->hdr = NULL;
367  }
368  
369  static int
370  nvmf_tcp_handle_term_req(struct nvmf_tcp_rxpdu *pdu)
371  {
372  	const struct nvme_tcp_term_req_hdr *hdr;
373  
374  	hdr = (const void *)pdu->hdr;
375  
376  	printf("NVMe/TCP: Received termination request: fes %#x fei %#x\n",
377  	    le16toh(hdr->fes), le32dec(hdr->fei));
378  	nvmf_tcp_free_pdu(pdu);
379  	return (ECONNRESET);
380  }
381  
382  static int
383  nvmf_tcp_save_command_capsule(struct nvmf_tcp_qpair *qp,
384      struct nvmf_tcp_rxpdu *pdu)
385  {
386  	const struct nvme_tcp_cmd *cmd;
387  	struct nvmf_capsule *nc;
388  	struct nvmf_tcp_capsule *tc;
389  
390  	cmd = (const void *)pdu->hdr;
391  
392  	nc = nvmf_allocate_command(&qp->qp, &cmd->ccsqe, M_WAITOK);
393  
394  	tc = TCAP(nc);
395  	tc->rx_pdu = *pdu;
396  
397  	nvmf_capsule_received(&qp->qp, nc);
398  	return (0);
399  }
400  
401  static int
402  nvmf_tcp_save_response_capsule(struct nvmf_tcp_qpair *qp,
403      struct nvmf_tcp_rxpdu *pdu)
404  {
405  	const struct nvme_tcp_rsp *rsp;
406  	struct nvmf_capsule *nc;
407  	struct nvmf_tcp_capsule *tc;
408  
409  	rsp = (const void *)pdu->hdr;
410  
411  	nc = nvmf_allocate_response(&qp->qp, &rsp->rccqe, M_WAITOK);
412  
413  	nc->nc_sqhd_valid = true;
414  	tc = TCAP(nc);
415  	tc->rx_pdu = *pdu;
416  
417  	/*
418  	 * Once the CQE has been received, no further transfers to the
419  	 * command buffer for the associated CID can occur.
420  	 */
421  	tcp_purge_command_buffer(&qp->rx_buffers, rsp->rccqe.cid, 0);
422  	tcp_purge_command_buffer(&qp->tx_buffers, rsp->rccqe.cid, 0);
423  
424  	nvmf_capsule_received(&qp->qp, nc);
425  	return (0);
426  }
427  
428  /*
429   * Construct a PDU that contains an optional data payload.  This
430   * includes dealing with digests and the length fields in the common
431   * header.
432   */
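/*
 * The resulting mbuf chain is laid out as:
 *
 *	CH | [HDGST] | [PAD] | [DATA] | [DDGST]
 *
 * where HDGST/DDGST are present only when the corresponding digests are
 * enabled, and PAD aligns the start of the data (if any) with txpda.
 */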
433  static struct mbuf *
434  nvmf_tcp_construct_pdu(struct nvmf_tcp_qpair *qp, void *hdr, size_t hlen,
435      struct mbuf *data, uint32_t data_len)
436  {
437  	struct nvme_tcp_common_pdu_hdr *ch;
438  	struct mbuf *top;
439  	uint32_t digest, pad, pdo, plen, mlen;
440  
441  	plen = hlen;
442  	if (qp->header_digests)
443  		plen += sizeof(digest);
444  	if (data_len != 0) {
445  		KASSERT(m_length(data, NULL) == data_len, ("length mismatch"));
446  		pdo = roundup(plen, qp->txpda);
447  		pad = pdo - plen;
448  		plen = pdo + data_len;
449  		if (qp->data_digests)
450  			plen += sizeof(digest);
451  		mlen = pdo;
452  	} else {
453  		KASSERT(data == NULL, ("payload mbuf with zero length"));
454  		pdo = 0;
455  		pad = 0;
456  		mlen = plen;
457  	}
458  
459  	top = m_get2(mlen, M_WAITOK, MT_DATA, 0);
460  	top->m_len = mlen;
461  	ch = mtod(top, void *);
462  	memcpy(ch, hdr, hlen);
463  	ch->hlen = hlen;
464  	if (qp->header_digests)
465  		ch->flags |= NVME_TCP_CH_FLAGS_HDGSTF;
466  	if (qp->data_digests && data_len != 0)
467  		ch->flags |= NVME_TCP_CH_FLAGS_DDGSTF;
468  	ch->pdo = pdo;
469  	ch->plen = htole32(plen);
470  
471  	/* HDGST */
472  	if (qp->header_digests) {
473  		digest = compute_digest(ch, hlen);
474  		memcpy((char *)ch + hlen, &digest, sizeof(digest));
475  	}
476  
477  	if (pad != 0) {
478  		/* PAD */
479  		memset((char *)ch + pdo - pad, 0, pad);
480  	}
481  
482  	if (data_len != 0) {
483  		/* DATA */
484  		top->m_next = data;
485  
486  		/* DDGST */
487  		if (qp->data_digests) {
488  			digest = mbuf_crc32c(data, 0, data_len);
489  
490  			/* XXX: Can't use m_append as it uses M_NOWAIT. */
491  			while (data->m_next != NULL)
492  				data = data->m_next;
493  
494  			data->m_next = m_get(M_WAITOK, MT_DATA);
495  			data->m_next->m_len = sizeof(digest);
496  			memcpy(mtod(data->m_next, void *), &digest,
497  			    sizeof(digest));
498  		}
499  	}
500  
501  	return (top);
502  }
503  
504  /* Find the next command buffer eligible to schedule for R2T. */
505  static struct nvmf_tcp_command_buffer *
506  nvmf_tcp_next_r2t(struct nvmf_tcp_qpair *qp)
507  {
508  	struct nvmf_tcp_command_buffer *cb;
509  
510  	mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
511  	MPASS(qp->active_ttags < qp->num_ttags);
512  
513  	TAILQ_FOREACH(cb, &qp->rx_buffers.head, link) {
514  		/* NB: maxr2t is 0's based. */
515  		if (cb->tc->active_r2ts > qp->maxr2t)
516  			continue;
517  #ifdef INVARIANTS
518  		cb->tc->pending_r2ts--;
519  #endif
520  		TAILQ_REMOVE(&qp->rx_buffers.head, cb, link);
521  		return (cb);
522  	}
523  	return (NULL);
524  }
525  
526  /* Allocate the next free transfer tag and assign it to cb. */
527  static void
528  nvmf_tcp_allocate_ttag(struct nvmf_tcp_qpair *qp,
529      struct nvmf_tcp_command_buffer *cb)
530  {
531  	uint16_t ttag;
532  
533  	mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
534  
535  	ttag = qp->next_ttag;
536  	for (;;) {
537  		if (qp->open_ttags[ttag] == NULL)
538  			break;
539  		if (ttag == qp->num_ttags - 1)
540  			ttag = 0;
541  		else
542  			ttag++;
543  		MPASS(ttag != qp->next_ttag);
544  	}
545  	if (ttag == qp->num_ttags - 1)
546  		qp->next_ttag = 0;
547  	else
548  		qp->next_ttag = ttag + 1;
549  
550  	cb->tc->active_r2ts++;
551  	qp->active_ttags++;
552  	qp->open_ttags[ttag] = cb;
553  
554  	/*
555  	 * Don't bother byte-swapping ttag as it is just a cookie
556  	 * value returned by the other end as-is.
557  	 */
558  	cb->ttag = ttag;
559  }
560  
561  /* NB: cid and ttag are both little-endian already. */
562  static void
563  tcp_send_r2t(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
564      uint32_t data_offset, uint32_t data_len)
565  {
566  	struct nvme_tcp_r2t_hdr r2t;
567  	struct mbuf *m;
568  
569  	memset(&r2t, 0, sizeof(r2t));
570  	r2t.common.pdu_type = NVME_TCP_PDU_TYPE_R2T;
571  	r2t.cccid = cid;
572  	r2t.ttag = ttag;
573  	r2t.r2to = htole32(data_offset);
574  	r2t.r2tl = htole32(data_len);
575  
576  	m = nvmf_tcp_construct_pdu(qp, &r2t, sizeof(r2t), NULL, 0);
577  	nvmf_tcp_write_pdu(qp, m);
578  }
579  
580  /*
581   * Release a transfer tag and schedule another R2T.
582   *
583   * NB: This drops the rx_buffers.lock mutex.
584   */
585  static void
586  nvmf_tcp_send_next_r2t(struct nvmf_tcp_qpair *qp,
587      struct nvmf_tcp_command_buffer *cb)
588  {
589  	struct nvmf_tcp_command_buffer *ncb;
590  
591  	mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
592  	MPASS(qp->open_ttags[cb->ttag] == cb);
593  
594  	/* Release this transfer tag. */
595  	qp->open_ttags[cb->ttag] = NULL;
596  	qp->active_ttags--;
597  	cb->tc->active_r2ts--;
598  
599  	/* Schedule another R2T. */
600  	ncb = nvmf_tcp_next_r2t(qp);
601  	if (ncb != NULL) {
602  		nvmf_tcp_allocate_ttag(qp, ncb);
603  		mtx_unlock(&qp->rx_buffers.lock);
604  		tcp_send_r2t(qp, ncb->cid, ncb->ttag, ncb->data_offset,
605  		    ncb->data_len);
606  	} else
607  		mtx_unlock(&qp->rx_buffers.lock);
608  }
609  
610  /*
611   * Copy len bytes starting at offset skip from an mbuf chain into an
612   * I/O buffer at destination offset io_offset.
613   */
614  static void
615  mbuf_copyto_io(struct mbuf *m, u_int skip, u_int len,
616      struct nvmf_io_request *io, u_int io_offset)
617  {
618  	u_int todo;
619  
620  	while (m->m_len <= skip) {
621  		skip -= m->m_len;
622  		m = m->m_next;
623  	}
624  	while (len != 0) {
625  		MPASS((m->m_flags & M_EXTPG) == 0);
626  
627  		todo = min(m->m_len - skip, len);
628  		memdesc_copyback(&io->io_mem, io_offset, todo, mtodo(m, skip));
629  		skip = 0;
630  		io_offset += todo;
631  		len -= todo;
632  		m = m->m_next;
633  	}
634  }
635  
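/*
 * Controller-side handler for H2C_DATA PDUs: look up the command buffer
 * by transfer tag, validate the length, offset, and LAST_PDU flag against
 * the outstanding R2T, and copy the payload into the I/O request.  Once
 * the transfer completes the tag is released and another R2T is scheduled.
 */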
636  static int
637  nvmf_tcp_handle_h2c_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
638  {
639  	const struct nvme_tcp_h2c_data_hdr *h2c;
640  	struct nvmf_tcp_command_buffer *cb;
641  	uint32_t data_len, data_offset;
642  	uint16_t ttag;
643  
644  	h2c = (const void *)pdu->hdr;
645  	if (le32toh(h2c->datal) > qp->maxh2cdata) {
646  		nvmf_tcp_report_error(qp,
647  		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_LIMIT_EXCEEDED, 0,
648  		    pdu->m, pdu->hdr->hlen);
649  		nvmf_tcp_free_pdu(pdu);
650  		return (EBADMSG);
651  	}
652  
653  	/*
654  	 * NB: Don't bother byte-swapping ttag as we don't byte-swap
655  	 * it when sending.
656  	 */
657  	ttag = h2c->ttag;
658  	if (ttag >= qp->num_ttags) {
659  		nvmf_tcp_report_error(qp,
660  		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
661  		    offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->m,
662  		    pdu->hdr->hlen);
663  		nvmf_tcp_free_pdu(pdu);
664  		return (EBADMSG);
665  	}
666  
667  	mtx_lock(&qp->rx_buffers.lock);
668  	cb = qp->open_ttags[ttag];
669  	if (cb == NULL) {
670  		mtx_unlock(&qp->rx_buffers.lock);
671  		nvmf_tcp_report_error(qp,
672  		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
673  		    offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->m,
674  		    pdu->hdr->hlen);
675  		nvmf_tcp_free_pdu(pdu);
676  		return (EBADMSG);
677  	}
678  	MPASS(cb->ttag == ttag);
679  
680  	/* For a data digest mismatch, fail the I/O request. */
681  	if (pdu->data_digest_mismatch) {
682  		nvmf_tcp_send_next_r2t(qp, cb);
683  		cb->error = EINTEGRITY;
684  		tcp_release_command_buffer(cb);
685  		nvmf_tcp_free_pdu(pdu);
686  		return (0);
687  	}
688  
689  	data_len = le32toh(h2c->datal);
690  	if (data_len != pdu->data_len) {
691  		mtx_unlock(&qp->rx_buffers.lock);
692  		nvmf_tcp_report_error(qp,
693  		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
694  		    offsetof(struct nvme_tcp_h2c_data_hdr, datal), pdu->m,
695  		    pdu->hdr->hlen);
696  		nvmf_tcp_free_pdu(pdu);
697  		return (EBADMSG);
698  	}
699  
700  	data_offset = le32toh(h2c->datao);
701  	if (data_offset < cb->data_offset ||
702  	    data_offset + data_len > cb->data_offset + cb->data_len) {
703  		mtx_unlock(&qp->rx_buffers.lock);
704  		nvmf_tcp_report_error(qp,
705  		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, pdu->m,
706  		    pdu->hdr->hlen);
707  		nvmf_tcp_free_pdu(pdu);
708  		return (EBADMSG);
709  	}
710  
711  	if (data_offset != cb->data_offset + cb->data_xfered) {
712  		mtx_unlock(&qp->rx_buffers.lock);
713  		nvmf_tcp_report_error(qp,
714  		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
715  		    pdu->hdr->hlen);
716  		nvmf_tcp_free_pdu(pdu);
717  		return (EBADMSG);
718  	}
719  
720  	if ((cb->data_xfered + data_len == cb->data_len) !=
721  	    ((pdu->hdr->flags & NVME_TCP_H2C_DATA_FLAGS_LAST_PDU) != 0)) {
722  		mtx_unlock(&qp->rx_buffers.lock);
723  		nvmf_tcp_report_error(qp,
724  		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
725  		    pdu->hdr->hlen);
726  		nvmf_tcp_free_pdu(pdu);
727  		return (EBADMSG);
728  	}
729  
730  	cb->data_xfered += data_len;
731  	data_offset -= cb->data_offset;
732  	if (cb->data_xfered == cb->data_len) {
733  		nvmf_tcp_send_next_r2t(qp, cb);
734  	} else {
735  		tcp_hold_command_buffer(cb);
736  		mtx_unlock(&qp->rx_buffers.lock);
737  	}
738  
739  	mbuf_copyto_io(pdu->m, pdu->hdr->pdo, data_len, &cb->io, data_offset);
740  
741  	tcp_release_command_buffer(cb);
742  	nvmf_tcp_free_pdu(pdu);
743  	return (0);
744  }
745  
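/*
 * Host-side handler for C2H_DATA PDUs: locate the receive command buffer
 * by CID, validate the transfer, and copy the payload into the I/O
 * request.  If the SUCCESS flag is set, synthesize a completion since no
 * CAPSULE_RESP PDU will follow.
 */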
746  static int
747  nvmf_tcp_handle_c2h_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
748  {
749  	const struct nvme_tcp_c2h_data_hdr *c2h;
750  	struct nvmf_tcp_command_buffer *cb;
751  	uint32_t data_len, data_offset;
752  
753  	c2h = (const void *)pdu->hdr;
754  
755  	mtx_lock(&qp->rx_buffers.lock);
756  	cb = tcp_find_command_buffer(&qp->rx_buffers, c2h->cccid, 0);
757  	if (cb == NULL) {
758  		mtx_unlock(&qp->rx_buffers.lock);
759  		/*
760  		 * XXX: Could be PDU sequence error if cccid is for a
761  		 * command that doesn't use a command buffer.
762  		 */
763  		nvmf_tcp_report_error(qp,
764  		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
765  		    offsetof(struct nvme_tcp_c2h_data_hdr, cccid), pdu->m,
766  		    pdu->hdr->hlen);
767  		nvmf_tcp_free_pdu(pdu);
768  		return (EBADMSG);
769  	}
770  
771  	/* For a data digest mismatch, fail the I/O request. */
772  	if (pdu->data_digest_mismatch) {
773  		cb->error = EINTEGRITY;
774  		tcp_remove_command_buffer(&qp->rx_buffers, cb);
775  		mtx_unlock(&qp->rx_buffers.lock);
776  		tcp_release_command_buffer(cb);
777  		nvmf_tcp_free_pdu(pdu);
778  		return (0);
779  	}
780  
781  	data_len = le32toh(c2h->datal);
782  	if (data_len != pdu->data_len) {
783  		mtx_unlock(&qp->rx_buffers.lock);
784  		nvmf_tcp_report_error(qp,
785  		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
786  		    offsetof(struct nvme_tcp_c2h_data_hdr, datal), pdu->m,
787  		    pdu->hdr->hlen);
788  		nvmf_tcp_free_pdu(pdu);
789  		return (EBADMSG);
790  	}
791  
792  	data_offset = le32toh(c2h->datao);
793  	if (data_offset < cb->data_offset ||
794  	    data_offset + data_len > cb->data_offset + cb->data_len) {
795  		mtx_unlock(&qp->rx_buffers.lock);
796  		nvmf_tcp_report_error(qp,
797  		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
798  		    pdu->m, pdu->hdr->hlen);
799  		nvmf_tcp_free_pdu(pdu);
800  		return (EBADMSG);
801  	}
802  
803  	if (data_offset != cb->data_offset + cb->data_xfered) {
804  		mtx_unlock(&qp->rx_buffers.lock);
805  		nvmf_tcp_report_error(qp,
806  		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
807  		    pdu->hdr->hlen);
808  		nvmf_tcp_free_pdu(pdu);
809  		return (EBADMSG);
810  	}
811  
812  	if ((cb->data_xfered + data_len == cb->data_len) !=
813  	    ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_LAST_PDU) != 0)) {
814  		mtx_unlock(&qp->rx_buffers.lock);
815  		nvmf_tcp_report_error(qp,
816  		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
817  		    pdu->hdr->hlen);
818  		nvmf_tcp_free_pdu(pdu);
819  		return (EBADMSG);
820  	}
821  
822  	cb->data_xfered += data_len;
823  	data_offset -= cb->data_offset;
824  	if (cb->data_xfered == cb->data_len)
825  		tcp_remove_command_buffer(&qp->rx_buffers, cb);
826  	else
827  		tcp_hold_command_buffer(cb);
828  	mtx_unlock(&qp->rx_buffers.lock);
829  
830  	mbuf_copyto_io(pdu->m, pdu->hdr->pdo, data_len, &cb->io, data_offset);
831  
832  	tcp_release_command_buffer(cb);
833  
834  	if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) {
835  		struct nvme_completion cqe;
836  		struct nvmf_capsule *nc;
837  
838  		memset(&cqe, 0, sizeof(cqe));
839  		cqe.cid = c2h->cccid;
840  
841  		nc = nvmf_allocate_response(&qp->qp, &cqe, M_WAITOK);
842  		nc->nc_sqhd_valid = false;
843  
844  		nvmf_capsule_received(&qp->qp, nc);
845  	}
846  
847  	nvmf_tcp_free_pdu(pdu);
848  	return (0);
849  }
850  
851  /* Called when m_free drops refcount to 0. */
852  static void
853  nvmf_tcp_mbuf_done(struct mbuf *m)
854  {
855  	struct nvmf_tcp_command_buffer *cb = m->m_ext.ext_arg1;
856  
857  	tcp_free_command_buffer(cb);
858  }
859  
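/*
 * Constructors passed to memdesc_alloc_ext_mbufs() to build read-only
 * mbufs backed directly by command buffer memory.  Each mbuf holds a
 * reference on the command buffer that is dropped when the mbuf is freed.
 */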
860  static struct mbuf *
861  nvmf_tcp_mbuf(void *arg, int how, void *data, size_t len)
862  {
863  	struct nvmf_tcp_command_buffer *cb = arg;
864  	struct mbuf *m;
865  
866  	m = m_get(how, MT_DATA);
867  	m->m_flags |= M_RDONLY;
868  	m_extaddref(m, data, len, &cb->refs, nvmf_tcp_mbuf_done, cb, NULL);
869  	m->m_len = len;
870  	return (m);
871  }
872  
873  static void
874  nvmf_tcp_free_mext_pg(struct mbuf *m)
875  {
876  	struct nvmf_tcp_command_buffer *cb = m->m_ext.ext_arg1;
877  
878  	M_ASSERTEXTPG(m);
879  	tcp_release_command_buffer(cb);
880  }
881  
882  static struct mbuf *
883  nvmf_tcp_mext_pg(void *arg, int how)
884  {
885  	struct nvmf_tcp_command_buffer *cb = arg;
886  	struct mbuf *m;
887  
888  	m = mb_alloc_ext_pgs(how, nvmf_tcp_free_mext_pg, M_RDONLY);
889  	m->m_ext.ext_arg1 = cb;
890  	tcp_hold_command_buffer(cb);
891  	return (m);
892  }
893  
894  /*
895   * Return an mbuf chain for a range of data belonging to a command
896   * buffer.
897   *
898   * The mbuf chain uses M_EXT mbufs which hold references on the
899   * command buffer so that it remains "alive" until the data has been
900   * fully transmitted.  If can_truncate is true, the returned chain
901   * may be shorter than requested to avoid gratuitously splitting up
902   * a page.
903   */
904  static struct mbuf *
905  nvmf_tcp_command_buffer_mbuf(struct nvmf_tcp_command_buffer *cb,
906      uint32_t data_offset, uint32_t data_len, uint32_t *actual_len,
907      bool can_truncate)
908  {
909  	struct mbuf *m;
910  	size_t len;
911  
912  	m = memdesc_alloc_ext_mbufs(&cb->io.io_mem, nvmf_tcp_mbuf,
913  	    nvmf_tcp_mext_pg, cb, M_WAITOK, data_offset, data_len, &len,
914  	    can_truncate);
915  	if (actual_len != NULL)
916  		*actual_len = len;
917  	return (m);
918  }
919  
920  /* NB: cid and ttag are little-endian already. */
921  static void
922  tcp_send_h2c_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
923      uint32_t data_offset, struct mbuf *m, size_t len, bool last_pdu)
924  {
925  	struct nvme_tcp_h2c_data_hdr h2c;
926  	struct mbuf *top;
927  
928  	memset(&h2c, 0, sizeof(h2c));
929  	h2c.common.pdu_type = NVME_TCP_PDU_TYPE_H2C_DATA;
930  	if (last_pdu)
931  		h2c.common.flags |= NVME_TCP_H2C_DATA_FLAGS_LAST_PDU;
932  	h2c.cccid = cid;
933  	h2c.ttag = ttag;
934  	h2c.datao = htole32(data_offset);
935  	h2c.datal = htole32(len);
936  
937  	top = nvmf_tcp_construct_pdu(qp, &h2c, sizeof(h2c), m, len);
938  	nvmf_tcp_write_pdu(qp, top);
939  }
940  
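/*
 * Host-side handler for R2T PDUs: validate the requested range against
 * the queued transmit command buffer and send the requested data as one
 * or more H2C_DATA PDUs of at most max_tx_data bytes each.
 */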
941  static int
942  nvmf_tcp_handle_r2t(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
943  {
944  	const struct nvme_tcp_r2t_hdr *r2t;
945  	struct nvmf_tcp_command_buffer *cb;
946  	uint32_t data_len, data_offset;
947  
948  	r2t = (const void *)pdu->hdr;
949  
950  	mtx_lock(&qp->tx_buffers.lock);
951  	cb = tcp_find_command_buffer(&qp->tx_buffers, r2t->cccid, 0);
952  	if (cb == NULL) {
953  		mtx_unlock(&qp->tx_buffers.lock);
954  		nvmf_tcp_report_error(qp,
955  		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
956  		    offsetof(struct nvme_tcp_r2t_hdr, cccid), pdu->m,
957  		    pdu->hdr->hlen);
958  		nvmf_tcp_free_pdu(pdu);
959  		return (EBADMSG);
960  	}
961  
962  	data_offset = le32toh(r2t->r2to);
963  	if (data_offset != cb->data_xfered) {
964  		mtx_unlock(&qp->tx_buffers.lock);
965  		nvmf_tcp_report_error(qp,
966  		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
967  		    pdu->hdr->hlen);
968  		nvmf_tcp_free_pdu(pdu);
969  		return (EBADMSG);
970  	}
971  
972  	/*
973  	 * XXX: The spec does not specify how to handle R2T transfers
974  	 * out of range of the original command.
975  	 */
976  	data_len = le32toh(r2t->r2tl);
977  	if (data_offset + data_len > cb->data_len) {
978  		mtx_unlock(&qp->tx_buffers.lock);
979  		nvmf_tcp_report_error(qp,
980  		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
981  		    pdu->m, pdu->hdr->hlen);
982  		nvmf_tcp_free_pdu(pdu);
983  		return (EBADMSG);
984  	}
985  
986  	cb->data_xfered += data_len;
987  	if (cb->data_xfered == cb->data_len)
988  		tcp_remove_command_buffer(&qp->tx_buffers, cb);
989  	else
990  		tcp_hold_command_buffer(cb);
991  	mtx_unlock(&qp->tx_buffers.lock);
992  
993  	/*
994  	 * Queue one or more H2C_DATA PDUs containing the requested
995  	 * data.
996  	 */
997  	while (data_len > 0) {
998  		struct mbuf *m;
999  		uint32_t sent, todo;
1000  
1001  		todo = min(data_len, qp->max_tx_data);
1002  		m = nvmf_tcp_command_buffer_mbuf(cb, data_offset, todo, &sent,
1003  		    todo < data_len);
1004  		tcp_send_h2c_pdu(qp, r2t->cccid, r2t->ttag, data_offset, m,
1005  		    sent, sent == data_len);
1006  
1007  		data_offset += sent;
1008  		data_len -= sent;
1009  	}
1010  
1011  	tcp_release_command_buffer(cb);
1012  	nvmf_tcp_free_pdu(pdu);
1013  	return (0);
1014  }
1015  
1016  /*
1017   * A variant of m_pullup that uses M_WAITOK instead of failing.  It
1018   * also doesn't do anything if enough bytes are already present in the
1019   * first mbuf.
1020   */
1021  static struct mbuf *
1022  pullup_pdu_hdr(struct mbuf *m, int len)
1023  {
1024  	struct mbuf *n, *p;
1025  
1026  	KASSERT(len <= MCLBYTES, ("%s: len too large", __func__));
1027  	if (m->m_len >= len)
1028  		return (m);
1029  
1030  	n = m_get2(len, M_WAITOK, MT_DATA, 0);
1031  	n->m_len = len;
1032  	m_copydata(m, 0, len, mtod(n, void *));
1033  
1034  	while (m != NULL && m->m_len <= len) {
1035  		p = m->m_next;
1036  		len -= m->m_len;
1037  		m_free(m);
1038  		m = p;
1039  	}
1040  	if (len > 0) {
1041  		m->m_data += len;
1042  		m->m_len -= len;
1043  	}
1044  	n->m_next = m;
1045  	return (n);
1046  }
1047  
1048  static int
1049  nvmf_tcp_dispatch_pdu(struct nvmf_tcp_qpair *qp,
1050      const struct nvme_tcp_common_pdu_hdr *ch, struct nvmf_tcp_rxpdu *pdu)
1051  {
1052  	/* Ensure the PDU header is contiguous. */
1053  	pdu->m = pullup_pdu_hdr(pdu->m, ch->hlen);
1054  	pdu->hdr = mtod(pdu->m, const void *);
1055  
1056  	switch (ch->pdu_type) {
1057  	default:
1058  		__assert_unreachable();
1059  		break;
1060  	case NVME_TCP_PDU_TYPE_H2C_TERM_REQ:
1061  	case NVME_TCP_PDU_TYPE_C2H_TERM_REQ:
1062  		return (nvmf_tcp_handle_term_req(pdu));
1063  	case NVME_TCP_PDU_TYPE_CAPSULE_CMD:
1064  		return (nvmf_tcp_save_command_capsule(qp, pdu));
1065  	case NVME_TCP_PDU_TYPE_CAPSULE_RESP:
1066  		return (nvmf_tcp_save_response_capsule(qp, pdu));
1067  	case NVME_TCP_PDU_TYPE_H2C_DATA:
1068  		return (nvmf_tcp_handle_h2c_data(qp, pdu));
1069  	case NVME_TCP_PDU_TYPE_C2H_DATA:
1070  		return (nvmf_tcp_handle_c2h_data(qp, pdu));
1071  	case NVME_TCP_PDU_TYPE_R2T:
1072  		return (nvmf_tcp_handle_r2t(qp, pdu));
1073  	}
1074  }
1075  
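/*
 * Receive kthread: peek at the common header to learn the PDU length,
 * read the full PDU into an mbuf chain, then validate and dispatch it.
 * On a fatal error the qpair is notified and the thread parks until the
 * queue pair is shut down.
 */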
1076  static void
1077  nvmf_tcp_receive(void *arg)
1078  {
1079  	struct nvmf_tcp_qpair *qp = arg;
1080  	struct socket *so = qp->so;
1081  	struct nvmf_tcp_rxpdu pdu;
1082  	struct nvme_tcp_common_pdu_hdr ch;
1083  	struct uio uio;
1084  	struct iovec iov[1];
1085  	struct mbuf *m, *n, *tail;
1086  	u_int avail, needed;
1087  	int error, flags, terror;
1088  	bool have_header;
1089  
1090  	m = tail = NULL;
1091  	have_header = false;
1092  	SOCKBUF_LOCK(&so->so_rcv);
1093  	while (!qp->rx_shutdown) {
1094  		/* Wait until there is enough data for the next step. */
1095  		if (so->so_error != 0 || so->so_rerror != 0) {
1096  			if (so->so_error != 0)
1097  				error = so->so_error;
1098  			else
1099  				error = so->so_rerror;
1100  			SOCKBUF_UNLOCK(&so->so_rcv);
1101  		error:
1102  			m_freem(m);
1103  			nvmf_qpair_error(&qp->qp, error);
1104  			SOCKBUF_LOCK(&so->so_rcv);
1105  			while (!qp->rx_shutdown)
1106  				cv_wait(&qp->rx_cv, SOCKBUF_MTX(&so->so_rcv));
1107  			break;
1108  		}
1109  		avail = sbavail(&so->so_rcv);
1110  		if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) != 0) {
1111  			if (!have_header && avail == 0)
1112  				error = 0;
1113  			else
1114  				error = ECONNRESET;
1115  			SOCKBUF_UNLOCK(&so->so_rcv);
1116  			goto error;
1117  		}
1118  		if (avail == 0 || (!have_header && avail < sizeof(ch))) {
1119  			cv_wait(&qp->rx_cv, SOCKBUF_MTX(&so->so_rcv));
1120  			continue;
1121  		}
1122  		SOCKBUF_UNLOCK(&so->so_rcv);
1123  
1124  		if (!have_header) {
1125  			KASSERT(m == NULL, ("%s: m != NULL but no header",
1126  			    __func__));
1127  			memset(&uio, 0, sizeof(uio));
1128  			iov[0].iov_base = &ch;
1129  			iov[0].iov_len = sizeof(ch);
1130  			uio.uio_iov = iov;
1131  			uio.uio_iovcnt = 1;
1132  			uio.uio_resid = sizeof(ch);
1133  			uio.uio_segflg = UIO_SYSSPACE;
1134  			uio.uio_rw = UIO_READ;
1135  			flags = MSG_DONTWAIT | MSG_PEEK;
1136  
1137  			error = soreceive(so, NULL, &uio, NULL, NULL, &flags);
1138  			if (error != 0)
1139  				goto error;
1140  			KASSERT(uio.uio_resid == 0, ("%s: short CH read",
1141  			    __func__));
1142  
1143  			have_header = true;
1144  			needed = le32toh(ch.plen);
1145  
1146  			/*
1147  			 * Malformed PDUs will be reported as errors
1148  			 * by nvmf_tcp_validate_pdu.  Just pass along
1149  			 * garbage headers if the lengths mismatch.
1150  			 */
1151  			if (needed < sizeof(ch) || ch.hlen > needed)
1152  				needed = sizeof(ch);
1153  
1154  			memset(&uio, 0, sizeof(uio));
1155  			uio.uio_resid = needed;
1156  		}
1157  
1158  		flags = MSG_DONTWAIT;
1159  		error = soreceive(so, NULL, &uio, &n, NULL, &flags);
1160  		if (error != 0)
1161  			goto error;
1162  
1163  		if (m == NULL)
1164  			m = n;
1165  		else
1166  			tail->m_next = n;
1167  
1168  		if (uio.uio_resid != 0) {
1169  			tail = n;
1170  			while (tail->m_next != NULL)
1171  				tail = tail->m_next;
1172  
1173  			SOCKBUF_LOCK(&so->so_rcv);
1174  			continue;
1175  		}
1176  #ifdef INVARIANTS
1177  		tail = NULL;
1178  #endif
1179  
1180  		pdu.m = m;
1181  		m = NULL;
1182  		pdu.hdr = &ch;
1183  		error = nvmf_tcp_validate_pdu(qp, &pdu);
1184  		if (error != 0)
1185  			m_freem(pdu.m);
1186  		else
1187  			error = nvmf_tcp_dispatch_pdu(qp, &ch, &pdu);
1188  		if (error != 0) {
1189  			/*
1190  			 * If we received a termination request, close
1191  			 * the connection immediately.
1192  			 */
1193  			if (error == ECONNRESET)
1194  				goto error;
1195  
1196  			/*
1197  			 * Wait for up to 30 seconds for the socket to
1198  			 * be closed by the other end.
1199  			 */
1200  			SOCKBUF_LOCK(&so->so_rcv);
1201  			if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1202  				terror = cv_timedwait(&qp->rx_cv,
1203  				    SOCKBUF_MTX(&so->so_rcv), 30 * hz);
1204  				if (terror == ETIMEDOUT)
1205  					printf("NVMe/TCP: Timed out after sending terminate request\n");
1206  			}
1207  			SOCKBUF_UNLOCK(&so->so_rcv);
1208  			goto error;
1209  		}
1210  
1211  		have_header = false;
1212  		SOCKBUF_LOCK(&so->so_rcv);
1213  	}
1214  	SOCKBUF_UNLOCK(&so->so_rcv);
1215  	kthread_exit();
1216  }
1217  
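/*
 * Convert a command capsule into a CAPSULE_CMD PDU.  A host-to-controller
 * payload small enough to fit within max_icd is sent as in-capsule data;
 * otherwise the command buffer is queued for later H2C or C2H transfers
 * and only an SGL describing it is sent.
 */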
1218  static struct mbuf *
1219  tcp_command_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_capsule *tc)
1220  {
1221  	struct nvmf_capsule *nc = &tc->nc;
1222  	struct nvmf_tcp_command_buffer *cb;
1223  	struct nvme_sgl_descriptor *sgl;
1224  	struct nvme_tcp_cmd cmd;
1225  	struct mbuf *top, *m;
1226  	bool use_icd;
1227  
1228  	use_icd = false;
1229  	cb = NULL;
1230  	m = NULL;
1231  
1232  	if (nc->nc_data.io_len != 0) {
1233  		cb = tcp_alloc_command_buffer(qp, &nc->nc_data, 0,
1234  		    nc->nc_data.io_len, nc->nc_sqe.cid);
1235  
1236  		if (nc->nc_send_data && nc->nc_data.io_len <= qp->max_icd) {
1237  			use_icd = true;
1238  			m = nvmf_tcp_command_buffer_mbuf(cb, 0,
1239  			    nc->nc_data.io_len, NULL, false);
1240  			cb->data_xfered = nc->nc_data.io_len;
1241  			tcp_release_command_buffer(cb);
1242  		} else if (nc->nc_send_data) {
1243  			mtx_lock(&qp->tx_buffers.lock);
1244  			tcp_add_command_buffer(&qp->tx_buffers, cb);
1245  			mtx_unlock(&qp->tx_buffers.lock);
1246  		} else {
1247  			mtx_lock(&qp->rx_buffers.lock);
1248  			tcp_add_command_buffer(&qp->rx_buffers, cb);
1249  			mtx_unlock(&qp->rx_buffers.lock);
1250  		}
1251  	}
1252  
1253  	memset(&cmd, 0, sizeof(cmd));
1254  	cmd.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_CMD;
1255  	cmd.ccsqe = nc->nc_sqe;
1256  
1257  	/* Populate SGL in SQE. */
1258  	sgl = &cmd.ccsqe.sgl;
1259  	memset(sgl, 0, sizeof(*sgl));
1260  	sgl->address = 0;
1261  	sgl->length = htole32(nc->nc_data.io_len);
1262  	if (use_icd) {
1263  		/* Use in-capsule data. */
1264  		sgl->type = NVME_SGL_TYPE_ICD;
1265  	} else {
1266  		/* Use a command buffer. */
1267  		sgl->type = NVME_SGL_TYPE_COMMAND_BUFFER;
1268  	}
1269  
1270  	top = nvmf_tcp_construct_pdu(qp, &cmd, sizeof(cmd), m, m != NULL ?
1271  	    nc->nc_data.io_len : 0);
1272  	return (top);
1273  }
1274  
1275  static struct mbuf *
1276  tcp_response_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_capsule *tc)
1277  {
1278  	struct nvmf_capsule *nc = &tc->nc;
1279  	struct nvme_tcp_rsp rsp;
1280  
1281  	memset(&rsp, 0, sizeof(rsp));
1282  	rsp.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_RESP;
1283  	rsp.rccqe = nc->nc_cqe;
1284  
1285  	return (nvmf_tcp_construct_pdu(qp, &rsp, sizeof(rsp), NULL, 0));
1286  }
1287  
1288  static struct mbuf *
1289  capsule_to_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_capsule *tc)
1290  {
1291  	if (tc->nc.nc_qe_len == sizeof(struct nvme_command))
1292  		return (tcp_command_pdu(qp, tc));
1293  	else
1294  		return (tcp_response_pdu(qp, tc));
1295  }
1296  
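/*
 * Transmit kthread: convert queued capsules into PDUs and push queued
 * PDU mbuf chains into the socket, splitting them as needed to fit the
 * available socket buffer space.
 */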
1297  static void
1298  nvmf_tcp_send(void *arg)
1299  {
1300  	struct nvmf_tcp_qpair *qp = arg;
1301  	struct nvmf_tcp_capsule *tc;
1302  	struct socket *so = qp->so;
1303  	struct mbuf *m, *n, *p;
1304  	u_long space, tosend;
1305  	int error;
1306  
1307  	m = NULL;
1308  	SOCKBUF_LOCK(&so->so_snd);
1309  	while (!qp->tx_shutdown) {
1310  		if (so->so_error != 0) {
1311  			error = so->so_error;
1312  			SOCKBUF_UNLOCK(&so->so_snd);
1313  		error:
1314  			m_freem(m);
1315  			nvmf_qpair_error(&qp->qp, error);
1316  			SOCKBUF_LOCK(&so->so_snd);
1317  			while (!qp->tx_shutdown)
1318  				cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd));
1319  			break;
1320  		}
1321  
1322  		if (m == NULL) {
1323  			/* Next PDU to send. */
1324  			m = mbufq_dequeue(&qp->tx_pdus);
1325  		}
1326  		if (m == NULL) {
1327  			if (STAILQ_EMPTY(&qp->tx_capsules)) {
1328  				cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd));
1329  				continue;
1330  			}
1331  
1332  			/* Convert a capsule into a PDU. */
1333  			tc = STAILQ_FIRST(&qp->tx_capsules);
1334  			STAILQ_REMOVE_HEAD(&qp->tx_capsules, link);
1335  			SOCKBUF_UNLOCK(&so->so_snd);
1336  
1337  			n = capsule_to_pdu(qp, tc);
1338  			tcp_release_capsule(tc);
1339  
1340  			SOCKBUF_LOCK(&so->so_snd);
1341  			mbufq_enqueue(&qp->tx_pdus, n);
1342  			continue;
1343  		}
1344  
1345  		/*
1346  		 * Wait until there is enough room to send some data.
1347  		 * If the socket buffer is empty, always send at least
1348  		 * something.
1349  		 */
1350  		space = sbspace(&so->so_snd);
1351  		if (space < m->m_len && sbused(&so->so_snd) != 0) {
1352  			cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd));
1353  			continue;
1354  		}
1355  		SOCKBUF_UNLOCK(&so->so_snd);
1356  
1357  		/*
1358  		 * If 'm' is too big, then the socket buffer must be
1359  		 * empty.  Split 'm' to make at least some forward
1360  		 * progress.
1361  		 *
1362  		 * Otherwise, chain up as many pending mbufs from 'm'
1363  		 * as will fit.
1364  		 */
1365  		if (m->m_len > space) {
1366  			n = m_split(m, space, M_WAITOK);
1367  		} else {
1368  			tosend = m->m_len;
1369  			n = m->m_next;
1370  			p = m;
1371  			while (n != NULL && tosend + n->m_len <= space) {
1372  				tosend += n->m_len;
1373  				p = n;
1374  				n = n->m_next;
1375  			}
1376  			KASSERT(p->m_next == n, ("%s: p not before n",
1377  			    __func__));
1378  			p->m_next = NULL;
1379  
1380  			KASSERT(m_length(m, NULL) == tosend,
1381  			    ("%s: length mismatch", __func__));
1382  		}
1383  		error = sosend(so, NULL, NULL, m, NULL, MSG_DONTWAIT, NULL);
1384  		if (error != 0) {
1385  			m = NULL;
1386  			m_freem(n);
1387  			goto error;
1388  		}
1389  		m = n;
1390  		SOCKBUF_LOCK(&so->so_snd);
1391  	}
1392  	SOCKBUF_UNLOCK(&so->so_snd);
1393  	kthread_exit();
1394  }
1395  
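/*
 * Socket upcalls that wake the receive and transmit kthreads when the
 * socket becomes readable or writable.
 */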
1396  static int
1397  nvmf_soupcall_receive(struct socket *so, void *arg, int waitflag)
1398  {
1399  	struct nvmf_tcp_qpair *qp = arg;
1400  
1401  	if (soreadable(so))
1402  		cv_signal(&qp->rx_cv);
1403  	return (SU_OK);
1404  }
1405  
1406  static int
1407  nvmf_soupcall_send(struct socket *so, void *arg, int waitflag)
1408  {
1409  	struct nvmf_tcp_qpair *qp = arg;
1410  
1411  	if (sowriteable(so))
1412  		cv_signal(&qp->tx_cv);
1413  	return (SU_OK);
1414  }
1415  
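/*
 * Create a queue pair from an nvlist describing a connected TCP socket
 * and the negotiated transport parameters (PDAs, digests, MAXR2T,
 * MAXH2CDATA, ICD size).  The socket is claimed from the file descriptor
 * and dedicated receive and transmit kthreads are started.
 */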
1416  static struct nvmf_qpair *
1417  tcp_allocate_qpair(bool controller, const nvlist_t *nvl)
1418  {
1419  	struct nvmf_tcp_qpair *qp;
1420  	struct socket *so;
1421  	struct file *fp;
1422  	cap_rights_t rights;
1423  	int error;
1424  
1425  	if (!nvlist_exists_number(nvl, "fd") ||
1426  	    !nvlist_exists_number(nvl, "rxpda") ||
1427  	    !nvlist_exists_number(nvl, "txpda") ||
1428  	    !nvlist_exists_bool(nvl, "header_digests") ||
1429  	    !nvlist_exists_bool(nvl, "data_digests") ||
1430  	    !nvlist_exists_number(nvl, "maxr2t") ||
1431  	    !nvlist_exists_number(nvl, "maxh2cdata") ||
1432  	    !nvlist_exists_number(nvl, "max_icd"))
1433  		return (NULL);
1434  
1435  	error = fget(curthread, nvlist_get_number(nvl, "fd"),
1436  	    cap_rights_init_one(&rights, CAP_SOCK_CLIENT), &fp);
1437  	if (error != 0)
1438  		return (NULL);
1439  	if (fp->f_type != DTYPE_SOCKET) {
1440  		fdrop(fp, curthread);
1441  		return (NULL);
1442  	}
1443  	so = fp->f_data;
1444  	if (so->so_type != SOCK_STREAM ||
1445  	    so->so_proto->pr_protocol != IPPROTO_TCP) {
1446  		fdrop(fp, curthread);
1447  		return (NULL);
1448  	}
1449  
1450  	/* Claim socket from file descriptor. */
1451  	fp->f_ops = &badfileops;
1452  	fp->f_data = NULL;
1453  	fdrop(fp, curthread);
1454  
1455  	qp = malloc(sizeof(*qp), M_NVMF_TCP, M_WAITOK | M_ZERO);
1456  	qp->so = so;
1457  	refcount_init(&qp->refs, 1);
1458  	qp->txpda = nvlist_get_number(nvl, "txpda");
1459  	qp->rxpda = nvlist_get_number(nvl, "rxpda");
1460  	qp->header_digests = nvlist_get_bool(nvl, "header_digests");
1461  	qp->data_digests = nvlist_get_bool(nvl, "data_digests");
1462  	qp->maxr2t = nvlist_get_number(nvl, "maxr2t");
1463  	if (controller)
1464  		qp->maxh2cdata = nvlist_get_number(nvl, "maxh2cdata");
1465  	qp->max_tx_data = tcp_max_transmit_data;
1466  	if (!controller) {
1467  		qp->max_tx_data = min(qp->max_tx_data,
1468  		    nvlist_get_number(nvl, "maxh2cdata"));
1469  		qp->max_icd = nvlist_get_number(nvl, "max_icd");
1470  	}
1471  
1472  	if (controller) {
1473  		/* Use the SUCCESS flag if SQ flow control is disabled. */
1474  		qp->send_success = !nvlist_get_bool(nvl, "sq_flow_control");
1475  
1476  		/* NB: maxr2t is 0's based. */
1477  		qp->num_ttags = MIN((u_int)UINT16_MAX + 1,
1478  		    nvlist_get_number(nvl, "qsize") *
1479  		    ((uint64_t)qp->maxr2t + 1));
1480  		qp->open_ttags = mallocarray(qp->num_ttags,
1481  		    sizeof(*qp->open_ttags), M_NVMF_TCP, M_WAITOK | M_ZERO);
1482  	}
1483  
1484  	TAILQ_INIT(&qp->rx_buffers.head);
1485  	TAILQ_INIT(&qp->tx_buffers.head);
1486  	mtx_init(&qp->rx_buffers.lock, "nvmf/tcp rx buffers", NULL, MTX_DEF);
1487  	mtx_init(&qp->tx_buffers.lock, "nvmf/tcp tx buffers", NULL, MTX_DEF);
1488  
1489  	cv_init(&qp->rx_cv, "-");
1490  	cv_init(&qp->tx_cv, "-");
1491  	mbufq_init(&qp->tx_pdus, 0);
1492  	STAILQ_INIT(&qp->tx_capsules);
1493  
1494  	/* Register socket upcalls. */
1495  	SOCKBUF_LOCK(&so->so_rcv);
1496  	soupcall_set(so, SO_RCV, nvmf_soupcall_receive, qp);
1497  	SOCKBUF_UNLOCK(&so->so_rcv);
1498  	SOCKBUF_LOCK(&so->so_snd);
1499  	soupcall_set(so, SO_SND, nvmf_soupcall_send, qp);
1500  	SOCKBUF_UNLOCK(&so->so_snd);
1501  
1502  	/* Spin up kthreads. */
1503  	error = kthread_add(nvmf_tcp_receive, qp, NULL, &qp->rx_thread, 0, 0,
1504  	    "nvmef tcp rx");
1505  	if (error != 0) {
1506  		tcp_free_qpair(&qp->qp);
1507  		return (NULL);
1508  	}
1509  	error = kthread_add(nvmf_tcp_send, qp, NULL, &qp->tx_thread, 0, 0,
1510  	    "nvmef tcp tx");
1511  	if (error != 0) {
1512  		tcp_free_qpair(&qp->qp);
1513  		return (NULL);
1514  	}
1515  
1516  	return (&qp->qp);
1517  }
1518  
1519  static void
1520  tcp_release_qpair(struct nvmf_tcp_qpair *qp)
1521  {
1522  	if (refcount_release(&qp->refs))
1523  		free(qp, M_NVMF_TCP);
1524  }
1525  
1526  static void
1527  tcp_free_qpair(struct nvmf_qpair *nq)
1528  {
1529  	struct nvmf_tcp_qpair *qp = TQP(nq);
1530  	struct nvmf_tcp_command_buffer *ncb, *cb;
1531  	struct nvmf_tcp_capsule *ntc, *tc;
1532  	struct socket *so = qp->so;
1533  
1534  	/* Shut down kthreads and clear upcalls */
1535  	SOCKBUF_LOCK(&so->so_snd);
1536  	qp->tx_shutdown = true;
1537  	if (qp->tx_thread != NULL) {
1538  		cv_signal(&qp->tx_cv);
1539  		mtx_sleep(qp->tx_thread, SOCKBUF_MTX(&so->so_snd), 0,
1540  		    "nvtcptx", 0);
1541  	}
1542  	soupcall_clear(so, SO_SND);
1543  	SOCKBUF_UNLOCK(&so->so_snd);
1544  
1545  	SOCKBUF_LOCK(&so->so_rcv);
1546  	qp->rx_shutdown = true;
1547  	if (qp->rx_thread != NULL) {
1548  		cv_signal(&qp->rx_cv);
1549  		mtx_sleep(qp->rx_thread, SOCKBUF_MTX(&so->so_rcv), 0,
1550  		    "nvtcprx", 0);
1551  	}
1552  	soupcall_clear(so, SO_RCV);
1553  	SOCKBUF_UNLOCK(&so->so_rcv);
1554  
1555  	STAILQ_FOREACH_SAFE(tc, &qp->tx_capsules, link, ntc) {
1556  		nvmf_abort_capsule_data(&tc->nc, ECONNABORTED);
1557  		tcp_release_capsule(tc);
1558  	}
1559  	mbufq_drain(&qp->tx_pdus);
1560  
1561  	cv_destroy(&qp->tx_cv);
1562  	cv_destroy(&qp->rx_cv);
1563  
1564  	if (qp->open_ttags != NULL) {
1565  		for (u_int i = 0; i < qp->num_ttags; i++) {
1566  			cb = qp->open_ttags[i];
1567  			if (cb != NULL) {
1568  				cb->tc->active_r2ts--;
1569  				cb->error = ECONNABORTED;
1570  				tcp_release_command_buffer(cb);
1571  			}
1572  		}
1573  		free(qp->open_ttags, M_NVMF_TCP);
1574  	}
1575  
1576  	mtx_lock(&qp->rx_buffers.lock);
1577  	TAILQ_FOREACH_SAFE(cb, &qp->rx_buffers.head, link, ncb) {
1578  		tcp_remove_command_buffer(&qp->rx_buffers, cb);
1579  		mtx_unlock(&qp->rx_buffers.lock);
1580  #ifdef INVARIANTS
1581  		if (cb->tc != NULL)
1582  			cb->tc->pending_r2ts--;
1583  #endif
1584  		cb->error = ECONNABORTED;
1585  		tcp_release_command_buffer(cb);
1586  		mtx_lock(&qp->rx_buffers.lock);
1587  	}
1588  	mtx_destroy(&qp->rx_buffers.lock);
1589  
1590  	mtx_lock(&qp->tx_buffers.lock);
1591  	TAILQ_FOREACH_SAFE(cb, &qp->tx_buffers.head, link, ncb) {
1592  		tcp_remove_command_buffer(&qp->tx_buffers, cb);
1593  		mtx_unlock(&qp->tx_buffers.lock);
1594  		cb->error = ECONNABORTED;
1595  		tcp_release_command_buffer(cb);
1596  		mtx_lock(&qp->tx_buffers.lock);
1597  	}
1598  	mtx_destroy(&qp->tx_buffers.lock);
1599  
1600  	soclose(so);
1601  
1602  	tcp_release_qpair(qp);
1603  }
1604  
1605  static struct nvmf_capsule *
1606  tcp_allocate_capsule(struct nvmf_qpair *nq, int how)
1607  {
1608  	struct nvmf_tcp_qpair *qp = TQP(nq);
1609  	struct nvmf_tcp_capsule *tc;
1610  
1611  	tc = malloc(sizeof(*tc), M_NVMF_TCP, how | M_ZERO);
1612  	if (tc == NULL)
1613  		return (NULL);
1614  	refcount_init(&tc->refs, 1);
1615  	refcount_acquire(&qp->refs);
1616  	return (&tc->nc);
1617  }
1618  
1619  static void
1620  tcp_release_capsule(struct nvmf_tcp_capsule *tc)
1621  {
1622  	struct nvmf_tcp_qpair *qp = TQP(tc->nc.nc_qpair);
1623  
1624  	if (!refcount_release(&tc->refs))
1625  		return;
1626  
1627  	MPASS(tc->active_r2ts == 0);
1628  	MPASS(tc->pending_r2ts == 0);
1629  
1630  	nvmf_tcp_free_pdu(&tc->rx_pdu);
1631  	free(tc, M_NVMF_TCP);
1632  	tcp_release_qpair(qp);
1633  }
1634  
1635  static void
1636  tcp_free_capsule(struct nvmf_capsule *nc)
1637  {
1638  	struct nvmf_tcp_capsule *tc = TCAP(nc);
1639  
1640  	tcp_release_capsule(tc);
1641  }
1642  
1643  static int
1644  tcp_transmit_capsule(struct nvmf_capsule *nc)
1645  {
1646  	struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
1647  	struct nvmf_tcp_capsule *tc = TCAP(nc);
1648  	struct socket *so = qp->so;
1649  
1650  	refcount_acquire(&tc->refs);
1651  	SOCKBUF_LOCK(&so->so_snd);
1652  	STAILQ_INSERT_TAIL(&qp->tx_capsules, tc, link);
1653  	if (sowriteable(so))
1654  		cv_signal(&qp->tx_cv);
1655  	SOCKBUF_UNLOCK(&so->so_snd);
1656  	return (0);
1657  }
1658  
1659  static uint8_t
1660  tcp_validate_command_capsule(struct nvmf_capsule *nc)
1661  {
1662  	struct nvmf_tcp_capsule *tc = TCAP(nc);
1663  	struct nvme_sgl_descriptor *sgl;
1664  
1665  	KASSERT(tc->rx_pdu.hdr != NULL, ("capsule wasn't received"));
1666  
1667  	sgl = &nc->nc_sqe.sgl;
1668  	switch (sgl->type) {
1669  	case NVME_SGL_TYPE_ICD:
1670  		if (tc->rx_pdu.data_len != le32toh(sgl->length)) {
1671  			printf("NVMe/TCP: Command Capsule with mismatched ICD length\n");
1672  			return (NVME_SC_DATA_SGL_LENGTH_INVALID);
1673  		}
1674  		break;
1675  	case NVME_SGL_TYPE_COMMAND_BUFFER:
1676  		if (tc->rx_pdu.data_len != 0) {
1677  			printf("NVMe/TCP: Command Buffer SGL with ICD\n");
1678  			return (NVME_SC_INVALID_FIELD);
1679  		}
1680  		break;
1681  	default:
1682  		printf("NVMe/TCP: Invalid SGL type in Command Capsule\n");
1683  		return (NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID);
1684  	}
1685  
1686  	if (sgl->address != 0) {
1687  		printf("NVMe/TCP: Invalid SGL offset in Command Capsule\n");
1688  		return (NVME_SC_SGL_OFFSET_INVALID);
1689  	}
1690  
1691  	return (NVME_SC_SUCCESS);
1692  }
1693  
1694  static size_t
1695  tcp_capsule_data_len(const struct nvmf_capsule *nc)
1696  {
1697  	MPASS(nc->nc_qe_len == sizeof(struct nvme_command));
1698  	return (le32toh(nc->nc_sqe.sgl.length));
1699  }
1700  
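/*
 * Controller-side receive of host data: allocate a command buffer for
 * the I/O request and either send an R2T immediately, if a transfer tag
 * and R2T slot are available, or queue the request on rx_buffers until
 * one frees up.
 */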
1701  static void
1702  tcp_receive_r2t_data(struct nvmf_capsule *nc, uint32_t data_offset,
1703      struct nvmf_io_request *io)
1704  {
1705  	struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
1706  	struct nvmf_tcp_capsule *tc = TCAP(nc);
1707  	struct nvmf_tcp_command_buffer *cb;
1708  
1709  	cb = tcp_alloc_command_buffer(qp, io, data_offset, io->io_len,
1710  	    nc->nc_sqe.cid);
1711  
1712  	cb->tc = tc;
1713  	refcount_acquire(&tc->refs);
1714  
1715  	/*
1716  	 * If this command has too many active R2Ts or there are no
1717  	 * available transfer tags, queue the request for later.
1718  	 *
1719  	 * NB: maxr2t is 0's based.
1720  	 */
1721  	mtx_lock(&qp->rx_buffers.lock);
1722  	if (tc->active_r2ts > qp->maxr2t || qp->active_ttags == qp->num_ttags) {
1723  #ifdef INVARIANTS
1724  		tc->pending_r2ts++;
1725  #endif
1726  		TAILQ_INSERT_TAIL(&qp->rx_buffers.head, cb, link);
1727  		mtx_unlock(&qp->rx_buffers.lock);
1728  		return;
1729  	}
1730  
1731  	nvmf_tcp_allocate_ttag(qp, cb);
1732  	mtx_unlock(&qp->rx_buffers.lock);
1733  
1734  	tcp_send_r2t(qp, nc->nc_sqe.cid, cb->ttag, data_offset, io->io_len);
1735  }
1736  
1737  static void
1738  tcp_receive_icd_data(struct nvmf_capsule *nc, uint32_t data_offset,
1739      struct nvmf_io_request *io)
1740  {
1741  	struct nvmf_tcp_capsule *tc = TCAP(nc);
1742  
1743  	mbuf_copyto_io(tc->rx_pdu.m, tc->rx_pdu.hdr->pdo + data_offset,
1744  	    io->io_len, io, 0);
1745  	nvmf_complete_io_request(io, io->io_len, 0);
1746  }
1747  
1748  static int
1749  tcp_receive_controller_data(struct nvmf_capsule *nc, uint32_t data_offset,
1750      struct nvmf_io_request *io)
1751  {
1752  	struct nvme_sgl_descriptor *sgl;
1753  	size_t data_len;
1754  
1755  	if (nc->nc_qe_len != sizeof(struct nvme_command) ||
1756  	    !nc->nc_qpair->nq_controller)
1757  		return (EINVAL);
1758  
1759  	sgl = &nc->nc_sqe.sgl;
1760  	data_len = le32toh(sgl->length);
1761  	if (data_offset + io->io_len > data_len)
1762  		return (EFBIG);
1763  
1764  	if (sgl->type == NVME_SGL_TYPE_ICD)
1765  		tcp_receive_icd_data(nc, data_offset, io);
1766  	else
1767  		tcp_receive_r2t_data(nc, data_offset, io);
1768  	return (0);
1769  }
1770  
1771  /* NB: cid is little-endian already. */
1772  static void
1773  tcp_send_c2h_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid, uint32_t data_offset,
1774      struct mbuf *m, size_t len, bool last_pdu, bool success)
1775  {
1776  	struct nvme_tcp_c2h_data_hdr c2h;
1777  	struct mbuf *top;
1778  
1779  	memset(&c2h, 0, sizeof(c2h));
1780  	c2h.common.pdu_type = NVME_TCP_PDU_TYPE_C2H_DATA;
1781  	if (last_pdu)
1782  		c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_LAST_PDU;
1783  	if (success)
1784  		c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_SUCCESS;
1785  	c2h.cccid = cid;
1786  	c2h.datao = htole32(data_offset);
1787  	c2h.datal = htole32(len);
1788  
1789  	top = nvmf_tcp_construct_pdu(qp, &c2h, sizeof(c2h), m, len);
1790  	nvmf_tcp_write_pdu(qp, top);
1791  }
1792  
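/*
 * Controller-side transmit of data to the host as C2H_DATA PDUs.
 * Returns NVMF_MORE if further transfers remain for this command,
 * NVMF_SUCCESS_SENT if the final PDU carried the SUCCESS flag, or
 * NVME_SC_SUCCESS if a CAPSULE_RESP still needs to be sent.
 */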
1793  static u_int
1794  tcp_send_controller_data(struct nvmf_capsule *nc, uint32_t data_offset,
1795      struct mbuf *m, size_t len)
1796  {
1797  	struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
1798  	struct nvme_sgl_descriptor *sgl;
1799  	uint32_t data_len;
1800  	bool last_pdu, last_xfer;
1801  
1802  	if (nc->nc_qe_len != sizeof(struct nvme_command) ||
1803  	    !qp->qp.nq_controller) {
1804  		m_freem(m);
1805  		return (NVME_SC_INVALID_FIELD);
1806  	}
1807  
1808  	sgl = &nc->nc_sqe.sgl;
1809  	data_len = le32toh(sgl->length);
1810  	if (data_offset + len > data_len) {
1811  		m_freem(m);
1812  		return (NVME_SC_INVALID_FIELD);
1813  	}
1814  	last_xfer = (data_offset + len == data_len);
1815  
1816  	if (sgl->type != NVME_SGL_TYPE_COMMAND_BUFFER) {
1817  		m_freem(m);
1818  		return (NVME_SC_INVALID_FIELD);
1819  	}
1820  
1821  	KASSERT(data_offset == TCAP(nc)->tx_data_offset,
1822  	    ("%s: starting data_offset %u doesn't match end of previous xfer %u",
1823  	    __func__, data_offset, TCAP(nc)->tx_data_offset));
1824  
1825  	/* Queue one or more C2H_DATA PDUs containing the data from 'm'. */
1826  	while (m != NULL) {
1827  		struct mbuf *n;
1828  		uint32_t todo;
1829  
1830  		if (m->m_len > qp->max_tx_data) {
1831  			n = m_split(m, qp->max_tx_data, M_WAITOK);
1832  			todo = m->m_len;
1833  		} else {
1834  			struct mbuf *p;
1835  
1836  			todo = m->m_len;
1837  			p = m;
1838  			n = p->m_next;
1839  			while (n != NULL) {
1840  				if (todo + n->m_len > qp->max_tx_data) {
1841  					p->m_next = NULL;
1842  					break;
1843  				}
1844  				todo += n->m_len;
1845  				p = n;
1846  				n = p->m_next;
1847  			}
1848  			MPASS(m_length(m, NULL) == todo);
1849  		}
1850  
1851  		last_pdu = (n == NULL && last_xfer);
1852  		tcp_send_c2h_pdu(qp, nc->nc_sqe.cid, data_offset, m, todo,
1853  		    last_pdu, last_pdu && qp->send_success);
1854  
1855  		data_offset += todo;
1856  		data_len -= todo;
1857  		m = n;
1858  	}
1859  	MPASS(data_len == 0);
1860  
1861  #ifdef INVARIANTS
1862  	TCAP(nc)->tx_data_offset = data_offset;
1863  #endif
1864  	if (!last_xfer)
1865  		return (NVMF_MORE);
1866  	else if (qp->send_success)
1867  		return (NVMF_SUCCESS_SENT);
1868  	else
1869  		return (NVME_SC_SUCCESS);
1870  }
1871  
1872  struct nvmf_transport_ops tcp_ops = {
1873  	.allocate_qpair = tcp_allocate_qpair,
1874  	.free_qpair = tcp_free_qpair,
1875  	.allocate_capsule = tcp_allocate_capsule,
1876  	.free_capsule = tcp_free_capsule,
1877  	.transmit_capsule = tcp_transmit_capsule,
1878  	.validate_command_capsule = tcp_validate_command_capsule,
1879  	.capsule_data_len = tcp_capsule_data_len,
1880  	.receive_controller_data = tcp_receive_controller_data,
1881  	.send_controller_data = tcp_send_controller_data,
1882  	.trtype = NVMF_TRTYPE_TCP,
1883  	.priority = 0,
1884  };
1885  
1886  NVMF_TRANSPORT(tcp, tcp_ops);
1887