1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2022-2024 Chelsio Communications, Inc.
5 * Written by: John Baldwin <jhb@FreeBSD.org>
6 */
7
8 #include <sys/param.h>
9 #include <sys/capsicum.h>
10 #include <sys/condvar.h>
11 #include <sys/file.h>
12 #include <sys/gsb_crc32.h>
13 #include <sys/kernel.h>
14 #include <sys/kthread.h>
15 #include <sys/limits.h>
16 #include <sys/lock.h>
17 #include <sys/malloc.h>
18 #include <sys/mbuf.h>
19 #include <sys/module.h>
20 #include <sys/mutex.h>
21 #include <sys/protosw.h>
22 #include <sys/refcount.h>
23 #include <sys/socket.h>
24 #include <sys/socketvar.h>
25 #include <sys/sysctl.h>
26 #include <sys/uio.h>
27 #include <netinet/in.h>
28 #include <dev/nvme/nvme.h>
29 #include <dev/nvmf/nvmf.h>
30 #include <dev/nvmf/nvmf_proto.h>
31 #include <dev/nvmf/nvmf_tcp.h>
32 #include <dev/nvmf/nvmf_transport.h>
33 #include <dev/nvmf/nvmf_transport_internal.h>
34
35 struct nvmf_tcp_capsule;
36 struct nvmf_tcp_qpair;
37
38 struct nvmf_tcp_command_buffer {
39 struct nvmf_tcp_qpair *qp;
40
41 struct nvmf_io_request io;
42 size_t data_len;
43 size_t data_xfered;
44 uint32_t data_offset;
45
46 u_int refs;
47 int error;
48
49 uint16_t cid;
50 uint16_t ttag;
51
52 TAILQ_ENTRY(nvmf_tcp_command_buffer) link;
53
54 /* Controller only */
55 struct nvmf_tcp_capsule *tc;
56 };
57
58 struct nvmf_tcp_command_buffer_list {
59 TAILQ_HEAD(, nvmf_tcp_command_buffer) head;
60 struct mtx lock;
61 };
62
63 struct nvmf_tcp_qpair {
64 struct nvmf_qpair qp;
65
66 struct socket *so;
67
68 volatile u_int refs; /* Every allocated capsule holds a reference */
69 uint8_t txpda;
70 uint8_t rxpda;
71 bool header_digests;
72 bool data_digests;
73 uint32_t maxr2t;
74 uint32_t maxh2cdata; /* Controller only */
75 uint32_t max_tx_data;
76 uint32_t max_icd; /* Host only */
77 uint16_t next_ttag; /* Controller only */
78 u_int num_ttags; /* Controller only */
79 u_int active_ttags; /* Controller only */
80 bool send_success; /* Controller only */
81
82 /* Receive state. */
83 struct thread *rx_thread;
84 struct cv rx_cv;
85 bool rx_shutdown;
86
87 /* Transmit state. */
88 struct thread *tx_thread;
89 struct cv tx_cv;
90 bool tx_shutdown;
91 struct mbufq tx_pdus;
92 STAILQ_HEAD(, nvmf_tcp_capsule) tx_capsules;
93
94 struct nvmf_tcp_command_buffer_list tx_buffers;
95 struct nvmf_tcp_command_buffer_list rx_buffers;
96
97 /*
98 * For the controller, an RX command buffer can be in one of
99 * two locations, all protected by the rx_buffers.lock. If a
100 * receive request is waiting for either an R2T slot for its
101 * command (due to exceeding MAXR2T) or a transfer tag, it is
102 * placed on the rx_buffers list. When a request is allocated
103 * an active transfer tag, it moves to the open_ttags[] array
104 * (indexed by the tag) until it completes.
105 */
106 struct nvmf_tcp_command_buffer **open_ttags; /* Controller only */
107 };
108
109 struct nvmf_tcp_rxpdu {
110 struct mbuf *m;
111 const struct nvme_tcp_common_pdu_hdr *hdr;
112 uint32_t data_len;
113 bool data_digest_mismatch;
114 };
115
116 struct nvmf_tcp_capsule {
117 struct nvmf_capsule nc;
118
119 volatile u_int refs;
120
121 struct nvmf_tcp_rxpdu rx_pdu;
122
123 uint32_t active_r2ts; /* Controller only */
124 #ifdef INVARIANTS
125 uint32_t tx_data_offset; /* Controller only */
126 u_int pending_r2ts; /* Controller only */
127 #endif
128
129 STAILQ_ENTRY(nvmf_tcp_capsule) link;
130 };
131
132 #define TCAP(nc) ((struct nvmf_tcp_capsule *)(nc))
133 #define TQP(qp) ((struct nvmf_tcp_qpair *)(qp))
134
135 static void tcp_release_capsule(struct nvmf_tcp_capsule *tc);
136 static void tcp_free_qpair(struct nvmf_qpair *nq);
137
138 SYSCTL_NODE(_kern_nvmf, OID_AUTO, tcp, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
139 "TCP transport");
140 static u_int tcp_max_transmit_data = 256 * 1024;
141 SYSCTL_UINT(_kern_nvmf_tcp, OID_AUTO, max_transmit_data, CTLFLAG_RWTUN,
142 &tcp_max_transmit_data, 0,
143 "Maximum size of data payload in a transmitted PDU");
144
145 static MALLOC_DEFINE(M_NVMF_TCP, "nvmf_tcp", "NVMe over TCP");
146
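/* m_apply() callback: fold one mbuf segment into the running CRC32C. */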
147 static int
148 mbuf_crc32c_helper(void *arg, void *data, u_int len)
149 {
150 uint32_t *digestp = arg;
151
152 *digestp = calculate_crc32c(*digestp, data, len);
153 return (0);
154 }
155
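/* Compute the CRC32C of a range of bytes in an mbuf chain. */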
156 static uint32_t
157 mbuf_crc32c(struct mbuf *m, u_int offset, u_int len)
158 {
159 uint32_t digest = 0xffffffff;
160
161 m_apply(m, offset, len, mbuf_crc32c_helper, &digest);
162 digest = digest ^ 0xffffffff;
163
164 return (digest);
165 }
166
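/* Compute the CRC32C of a contiguous buffer. */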
167 static uint32_t
168 compute_digest(const void *buf, size_t len)
169 {
170 return (calculate_crc32c(0xffffffff, buf, len) ^ 0xffffffff);
171 }
172
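/* Allocate a command buffer tracking the data transfer for an I/O request. */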
173 static struct nvmf_tcp_command_buffer *
174 tcp_alloc_command_buffer(struct nvmf_tcp_qpair *qp,
175 const struct nvmf_io_request *io, uint32_t data_offset, size_t data_len,
176 uint16_t cid)
177 {
178 struct nvmf_tcp_command_buffer *cb;
179
180 cb = malloc(sizeof(*cb), M_NVMF_TCP, M_WAITOK);
181 cb->qp = qp;
182 cb->io = *io;
183 cb->data_offset = data_offset;
184 cb->data_len = data_len;
185 cb->data_xfered = 0;
186 refcount_init(&cb->refs, 1);
187 cb->error = 0;
188 cb->cid = cid;
189 cb->ttag = 0;
190 cb->tc = NULL;
191
192 return (cb);
193 }
194
195 static void
196 tcp_hold_command_buffer(struct nvmf_tcp_command_buffer *cb)
197 {
198 refcount_acquire(&cb->refs);
199 }
200
201 static void
202 tcp_free_command_buffer(struct nvmf_tcp_command_buffer *cb)
203 {
204 nvmf_complete_io_request(&cb->io, cb->data_xfered, cb->error);
205 if (cb->tc != NULL)
206 tcp_release_capsule(cb->tc);
207 free(cb, M_NVMF_TCP);
208 }
209
210 static void
211 tcp_release_command_buffer(struct nvmf_tcp_command_buffer *cb)
212 {
213 if (refcount_release(&cb->refs))
214 tcp_free_command_buffer(cb);
215 }
216
217 static void
218 tcp_add_command_buffer(struct nvmf_tcp_command_buffer_list *list,
219 struct nvmf_tcp_command_buffer *cb)
220 {
221 mtx_assert(&list->lock, MA_OWNED);
222 TAILQ_INSERT_HEAD(&list->head, cb, link);
223 }
224
225 static struct nvmf_tcp_command_buffer *
226 tcp_find_command_buffer(struct nvmf_tcp_command_buffer_list *list,
227 uint16_t cid, uint16_t ttag)
228 {
229 struct nvmf_tcp_command_buffer *cb;
230
231 mtx_assert(&list->lock, MA_OWNED);
232 TAILQ_FOREACH(cb, &list->head, link) {
233 if (cb->cid == cid && cb->ttag == ttag)
234 return (cb);
235 }
236 return (NULL);
237 }
238
239 static void
240 tcp_remove_command_buffer(struct nvmf_tcp_command_buffer_list *list,
241 struct nvmf_tcp_command_buffer *cb)
242 {
243 mtx_assert(&list->lock, MA_OWNED);
244 TAILQ_REMOVE(&list->head, cb, link);
245 }
246
247 static void
248 tcp_purge_command_buffer(struct nvmf_tcp_command_buffer_list *list,
249 uint16_t cid, uint16_t ttag)
250 {
251 struct nvmf_tcp_command_buffer *cb;
252
253 mtx_lock(&list->lock);
254 cb = tcp_find_command_buffer(list, cid, ttag);
255 if (cb != NULL) {
256 tcp_remove_command_buffer(list, cb);
257 mtx_unlock(&list->lock);
258 tcp_release_command_buffer(cb);
259 } else
260 mtx_unlock(&list->lock);
261 }
262
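/* Queue a constructed PDU for transmission by the TX thread. */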
263 static void
264 nvmf_tcp_write_pdu(struct nvmf_tcp_qpair *qp, struct mbuf *m)
265 {
266 struct socket *so = qp->so;
267
268 SOCKBUF_LOCK(&so->so_snd);
269 mbufq_enqueue(&qp->tx_pdus, m);
270 /* XXX: Do we need to handle sb_hiwat being wrong? */
271 if (sowriteable(so))
272 cv_signal(&qp->tx_cv);
273 SOCKBUF_UNLOCK(&so->so_snd);
274 }
275
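/* Send a terminate request PDU echoing part of the offending PDU header. */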
276 static void
277 nvmf_tcp_report_error(struct nvmf_tcp_qpair *qp, uint16_t fes, uint32_t fei,
278 struct mbuf *rx_pdu, u_int hlen)
279 {
280 struct nvme_tcp_term_req_hdr *hdr;
281 struct mbuf *m;
282
283 if (hlen != 0) {
284 hlen = min(hlen, NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE);
285 hlen = min(hlen, m_length(rx_pdu, NULL));
286 }
287
288 m = m_get2(sizeof(*hdr) + hlen, M_WAITOK, MT_DATA, 0);
289 m->m_len = sizeof(*hdr) + hlen;
290 hdr = mtod(m, void *);
291 memset(hdr, 0, sizeof(*hdr));
292 hdr->common.pdu_type = qp->qp.nq_controller ?
293 NVME_TCP_PDU_TYPE_C2H_TERM_REQ : NVME_TCP_PDU_TYPE_H2C_TERM_REQ;
294 hdr->common.hlen = sizeof(*hdr);
295 hdr->common.plen = sizeof(*hdr) + hlen;
296 hdr->fes = htole16(fes);
297 le32enc(hdr->fei, fei);
298 if (hlen != 0)
299 m_copydata(rx_pdu, 0, hlen, (caddr_t)(hdr + 1));
300
301 nvmf_tcp_write_pdu(qp, m);
302 }
303
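/* Validate a received PDU header and verify any header and data digests. */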
304 static int
305 nvmf_tcp_validate_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
306 {
307 const struct nvme_tcp_common_pdu_hdr *ch;
308 struct mbuf *m = pdu->m;
309 uint32_t data_len, fei, plen;
310 uint32_t digest, rx_digest;
311 u_int hlen;
312 int error;
313 uint16_t fes;
314
315 /* Determine how large of a PDU header to return for errors. */
316 ch = pdu->hdr;
317 hlen = ch->hlen;
318 plen = le32toh(ch->plen);
319 if (hlen < sizeof(*ch) || hlen > plen)
320 hlen = sizeof(*ch);
321
322 error = nvmf_tcp_validate_pdu_header(ch, qp->qp.nq_controller,
323 qp->header_digests, qp->data_digests, qp->rxpda, &data_len, &fes,
324 &fei);
325 if (error != 0) {
326 if (error != ECONNRESET)
327 nvmf_tcp_report_error(qp, fes, fei, m, hlen);
328 return (error);
329 }
330
331 /* Check header digest if present. */
332 if ((ch->flags & NVME_TCP_CH_FLAGS_HDGSTF) != 0) {
333 digest = mbuf_crc32c(m, 0, ch->hlen);
334 m_copydata(m, ch->hlen, sizeof(rx_digest), (caddr_t)&rx_digest);
335 if (digest != rx_digest) {
336 printf("NVMe/TCP: Header digest mismatch\n");
337 nvmf_tcp_report_error(qp,
338 NVME_TCP_TERM_REQ_FES_HDGST_ERROR, rx_digest, m,
339 hlen);
340 return (EBADMSG);
341 }
342 }
343
344 /* Check data digest if present. */
345 pdu->data_digest_mismatch = false;
346 if ((ch->flags & NVME_TCP_CH_FLAGS_DDGSTF) != 0) {
347 digest = mbuf_crc32c(m, ch->pdo, data_len);
348 m_copydata(m, plen - sizeof(rx_digest), sizeof(rx_digest),
349 (caddr_t)&rx_digest);
350 if (digest != rx_digest) {
351 printf("NVMe/TCP: Data digest mismatch\n");
352 pdu->data_digest_mismatch = true;
353 }
354 }
355
356 pdu->data_len = data_len;
357 return (0);
358 }
359
360 static void
361 nvmf_tcp_free_pdu(struct nvmf_tcp_rxpdu *pdu)
362 {
363 m_freem(pdu->m);
364 pdu->m = NULL;
365 pdu->hdr = NULL;
366 }
367
368 static int
369 nvmf_tcp_handle_term_req(struct nvmf_tcp_rxpdu *pdu)
370 {
371 const struct nvme_tcp_term_req_hdr *hdr;
372
373 hdr = (const void *)pdu->hdr;
374
375 printf("NVMe/TCP: Received termination request: fes %#x fei %#x\n",
376 le16toh(hdr->fes), le32dec(hdr->fei));
377 nvmf_tcp_free_pdu(pdu);
378 return (ECONNRESET);
379 }
380
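/* Deliver a received command capsule PDU to the transport layer. */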
381 static int
382 nvmf_tcp_save_command_capsule(struct nvmf_tcp_qpair *qp,
383 struct nvmf_tcp_rxpdu *pdu)
384 {
385 const struct nvme_tcp_cmd *cmd;
386 struct nvmf_capsule *nc;
387 struct nvmf_tcp_capsule *tc;
388
389 cmd = (const void *)pdu->hdr;
390
391 nc = nvmf_allocate_command(&qp->qp, &cmd->ccsqe, M_WAITOK);
392
393 tc = TCAP(nc);
394 tc->rx_pdu = *pdu;
395
396 nvmf_capsule_received(&qp->qp, nc);
397 return (0);
398 }
399
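/* Deliver a response capsule and purge command buffers for its CID. */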
400 static int
401 nvmf_tcp_save_response_capsule(struct nvmf_tcp_qpair *qp,
402 struct nvmf_tcp_rxpdu *pdu)
403 {
404 const struct nvme_tcp_rsp *rsp;
405 struct nvmf_capsule *nc;
406 struct nvmf_tcp_capsule *tc;
407
408 rsp = (const void *)pdu->hdr;
409
410 nc = nvmf_allocate_response(&qp->qp, &rsp->rccqe, M_WAITOK);
411
412 nc->nc_sqhd_valid = true;
413 tc = TCAP(nc);
414 tc->rx_pdu = *pdu;
415
416 /*
417 * Once the CQE has been received, no further transfers to the
418 * command buffer for the associated CID can occur.
419 */
420 tcp_purge_command_buffer(&qp->rx_buffers, rsp->rccqe.cid, 0);
421 tcp_purge_command_buffer(&qp->tx_buffers, rsp->rccqe.cid, 0);
422
423 nvmf_capsule_received(&qp->qp, nc);
424 return (0);
425 }
426
427 /*
428 * Construct a PDU that contains an optional data payload. This
429 * includes dealing with digests and the length fields in the common
430 * header.
431 */
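/*
 * On-the-wire layout of the constructed PDU (optional parts in brackets):
 *
 *   CH (hlen bytes) [HDGST] [PAD to txpda alignment] [DATA] [DDGST]
 */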
432 static struct mbuf *
433 nvmf_tcp_construct_pdu(struct nvmf_tcp_qpair *qp, void *hdr, size_t hlen,
434 struct mbuf *data, uint32_t data_len)
435 {
436 struct nvme_tcp_common_pdu_hdr *ch;
437 struct mbuf *top;
438 uint32_t digest, pad, pdo, plen, mlen;
439
440 plen = hlen;
441 if (qp->header_digests)
442 plen += sizeof(digest);
443 if (data_len != 0) {
444 KASSERT(m_length(data, NULL) == data_len, ("length mismatch"));
445 pdo = roundup(plen, qp->txpda);
446 pad = pdo - plen;
447 plen = pdo + data_len;
448 if (qp->data_digests)
449 plen += sizeof(digest);
450 mlen = pdo;
451 } else {
452 KASSERT(data == NULL, ("payload mbuf with zero length"));
453 pdo = 0;
454 pad = 0;
455 mlen = plen;
456 }
457
458 top = m_get2(mlen, M_WAITOK, MT_DATA, 0);
459 top->m_len = mlen;
460 ch = mtod(top, void *);
461 memcpy(ch, hdr, hlen);
462 ch->hlen = hlen;
463 if (qp->header_digests)
464 ch->flags |= NVME_TCP_CH_FLAGS_HDGSTF;
465 if (qp->data_digests && data_len != 0)
466 ch->flags |= NVME_TCP_CH_FLAGS_DDGSTF;
467 ch->pdo = pdo;
468 ch->plen = htole32(plen);
469
470 /* HDGST */
471 if (qp->header_digests) {
472 digest = compute_digest(ch, hlen);
473 memcpy((char *)ch + hlen, &digest, sizeof(digest));
474 }
475
476 if (pad != 0) {
477 /* PAD */
478 memset((char *)ch + pdo - pad, 0, pad);
479 }
480
481 if (data_len != 0) {
482 /* DATA */
483 top->m_next = data;
484
485 /* DDGST */
486 if (qp->data_digests) {
487 digest = mbuf_crc32c(data, 0, data_len);
488
489 /* XXX: Can't use m_append as it uses M_NOWAIT. */
490 while (data->m_next != NULL)
491 data = data->m_next;
492
493 data->m_next = m_get(M_WAITOK, MT_DATA);
494 data->m_next->m_len = sizeof(digest);
495 memcpy(mtod(data->m_next, void *), &digest,
496 sizeof(digest));
497 }
498 }
499
500 return (top);
501 }
502
503 /* Find the next command buffer eligible to schedule for R2T. */
504 static struct nvmf_tcp_command_buffer *
505 nvmf_tcp_next_r2t(struct nvmf_tcp_qpair *qp)
506 {
507 struct nvmf_tcp_command_buffer *cb;
508
509 mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
510 MPASS(qp->active_ttags < qp->num_ttags);
511
512 TAILQ_FOREACH(cb, &qp->rx_buffers.head, link) {
513 /* NB: maxr2t is 0's based. */
514 if (cb->tc->active_r2ts > qp->maxr2t)
515 continue;
516 #ifdef INVARIANTS
517 cb->tc->pending_r2ts--;
518 #endif
519 TAILQ_REMOVE(&qp->rx_buffers.head, cb, link);
520 return (cb);
521 }
522 return (NULL);
523 }
524
525 /* Allocate the next free transfer tag and assign it to cb. */
526 static void
527 nvmf_tcp_allocate_ttag(struct nvmf_tcp_qpair *qp,
528 struct nvmf_tcp_command_buffer *cb)
529 {
530 uint16_t ttag;
531
532 mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
533
534 ttag = qp->next_ttag;
535 for (;;) {
536 if (qp->open_ttags[ttag] == NULL)
537 break;
538 if (ttag == qp->num_ttags - 1)
539 ttag = 0;
540 else
541 ttag++;
542 MPASS(ttag != qp->next_ttag);
543 }
544 if (ttag == qp->num_ttags - 1)
545 qp->next_ttag = 0;
546 else
547 qp->next_ttag = ttag + 1;
548
549 cb->tc->active_r2ts++;
550 qp->active_ttags++;
551 qp->open_ttags[ttag] = cb;
552
553 /*
554 * Don't bother byte-swapping ttag as it is just a cookie
555 * value returned by the other end as-is.
556 */
557 cb->ttag = ttag;
558 }
559
560 /* NB: cid and ttag are both little-endian already. */
561 static void
562 tcp_send_r2t(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
563 uint32_t data_offset, uint32_t data_len)
564 {
565 struct nvme_tcp_r2t_hdr r2t;
566 struct mbuf *m;
567
568 memset(&r2t, 0, sizeof(r2t));
569 r2t.common.pdu_type = NVME_TCP_PDU_TYPE_R2T;
570 r2t.cccid = cid;
571 r2t.ttag = ttag;
572 r2t.r2to = htole32(data_offset);
573 r2t.r2tl = htole32(data_len);
574
575 m = nvmf_tcp_construct_pdu(qp, &r2t, sizeof(r2t), NULL, 0);
576 nvmf_tcp_write_pdu(qp, m);
577 }
578
579 /*
580 * Release a transfer tag and schedule another R2T.
581 *
582 * NB: This drops the rx_buffers.lock mutex.
583 */
584 static void
585 nvmf_tcp_send_next_r2t(struct nvmf_tcp_qpair *qp,
586 struct nvmf_tcp_command_buffer *cb)
587 {
588 struct nvmf_tcp_command_buffer *ncb;
589
590 mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
591 MPASS(qp->open_ttags[cb->ttag] == cb);
592
593 /* Release this transfer tag. */
594 qp->open_ttags[cb->ttag] = NULL;
595 qp->active_ttags--;
596 cb->tc->active_r2ts--;
597
598 /* Schedule another R2T. */
599 ncb = nvmf_tcp_next_r2t(qp);
600 if (ncb != NULL) {
601 nvmf_tcp_allocate_ttag(qp, ncb);
602 mtx_unlock(&qp->rx_buffers.lock);
603 tcp_send_r2t(qp, ncb->cid, ncb->ttag, ncb->data_offset,
604 ncb->data_len);
605 } else
606 mtx_unlock(&qp->rx_buffers.lock);
607 }
608
609 /*
610 * Copy len bytes starting at offset skip from an mbuf chain into an
611 * I/O buffer at destination offset io_offset.
612 */
613 static void
614 mbuf_copyto_io(struct mbuf *m, u_int skip, u_int len,
615 struct nvmf_io_request *io, u_int io_offset)
616 {
617 u_int todo;
618
619 while (m->m_len <= skip) {
620 skip -= m->m_len;
621 m = m->m_next;
622 }
623 while (len != 0) {
624 MPASS((m->m_flags & M_EXTPG) == 0);
625
626 todo = min(m->m_len - skip, len);
627 memdesc_copyback(&io->io_mem, io_offset, todo, mtodo(m, skip));
628 skip = 0;
629 io_offset += todo;
630 len -= todo;
631 m = m->m_next;
632 }
633 }
634
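/* Controller: copy H2C_DATA payload into the command buffer for its ttag. */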
635 static int
636 nvmf_tcp_handle_h2c_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
637 {
638 const struct nvme_tcp_h2c_data_hdr *h2c;
639 struct nvmf_tcp_command_buffer *cb;
640 uint32_t data_len, data_offset;
641 uint16_t ttag;
642
643 h2c = (const void *)pdu->hdr;
644 if (le32toh(h2c->datal) > qp->maxh2cdata) {
645 nvmf_tcp_report_error(qp,
646 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_LIMIT_EXCEEDED, 0,
647 pdu->m, pdu->hdr->hlen);
648 nvmf_tcp_free_pdu(pdu);
649 return (EBADMSG);
650 }
651
652 /*
653 * NB: Don't bother byte-swapping ttag as we don't byte-swap
654 * it when sending.
655 */
656 ttag = h2c->ttag;
657 if (ttag >= qp->num_ttags) {
658 nvmf_tcp_report_error(qp,
659 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
660 offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->m,
661 pdu->hdr->hlen);
662 nvmf_tcp_free_pdu(pdu);
663 return (EBADMSG);
664 }
665
666 mtx_lock(&qp->rx_buffers.lock);
667 cb = qp->open_ttags[ttag];
668 if (cb == NULL) {
669 mtx_unlock(&qp->rx_buffers.lock);
670 nvmf_tcp_report_error(qp,
671 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
672 offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->m,
673 pdu->hdr->hlen);
674 nvmf_tcp_free_pdu(pdu);
675 return (EBADMSG);
676 }
677 MPASS(cb->ttag == ttag);
678
679 /* For a data digest mismatch, fail the I/O request. */
680 if (pdu->data_digest_mismatch) {
681 nvmf_tcp_send_next_r2t(qp, cb);
682 cb->error = EINTEGRITY;
683 tcp_release_command_buffer(cb);
684 nvmf_tcp_free_pdu(pdu);
685 return (0);
686 }
687
688 data_len = le32toh(h2c->datal);
689 if (data_len != pdu->data_len) {
690 mtx_unlock(&qp->rx_buffers.lock);
691 nvmf_tcp_report_error(qp,
692 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
693 offsetof(struct nvme_tcp_h2c_data_hdr, datal), pdu->m,
694 pdu->hdr->hlen);
695 nvmf_tcp_free_pdu(pdu);
696 return (EBADMSG);
697 }
698
699 data_offset = le32toh(h2c->datao);
700 if (data_offset < cb->data_offset ||
701 data_offset + data_len > cb->data_offset + cb->data_len) {
702 mtx_unlock(&qp->rx_buffers.lock);
703 nvmf_tcp_report_error(qp,
704 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, pdu->m,
705 pdu->hdr->hlen);
706 nvmf_tcp_free_pdu(pdu);
707 return (EBADMSG);
708 }
709
710 if (data_offset != cb->data_offset + cb->data_xfered) {
711 mtx_unlock(&qp->rx_buffers.lock);
712 nvmf_tcp_report_error(qp,
713 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
714 pdu->hdr->hlen);
715 nvmf_tcp_free_pdu(pdu);
716 return (EBADMSG);
717 }
718
719 if ((cb->data_xfered + data_len == cb->data_len) !=
720 ((pdu->hdr->flags & NVME_TCP_H2C_DATA_FLAGS_LAST_PDU) != 0)) {
721 mtx_unlock(&qp->rx_buffers.lock);
722 nvmf_tcp_report_error(qp,
723 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
724 pdu->hdr->hlen);
725 nvmf_tcp_free_pdu(pdu);
726 return (EBADMSG);
727 }
728
729 cb->data_xfered += data_len;
730 data_offset -= cb->data_offset;
731 if (cb->data_xfered == cb->data_len) {
732 nvmf_tcp_send_next_r2t(qp, cb);
733 } else {
734 tcp_hold_command_buffer(cb);
735 mtx_unlock(&qp->rx_buffers.lock);
736 }
737
738 mbuf_copyto_io(pdu->m, pdu->hdr->pdo, data_len, &cb->io, data_offset);
739
740 tcp_release_command_buffer(cb);
741 nvmf_tcp_free_pdu(pdu);
742 return (0);
743 }
744
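/* Host: copy C2H_DATA payload into the matching receive command buffer. */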
745 static int
746 nvmf_tcp_handle_c2h_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
747 {
748 const struct nvme_tcp_c2h_data_hdr *c2h;
749 struct nvmf_tcp_command_buffer *cb;
750 uint32_t data_len, data_offset;
751
752 c2h = (const void *)pdu->hdr;
753
754 mtx_lock(&qp->rx_buffers.lock);
755 cb = tcp_find_command_buffer(&qp->rx_buffers, c2h->cccid, 0);
756 if (cb == NULL) {
757 mtx_unlock(&qp->rx_buffers.lock);
758 /*
759 * XXX: Could be PDU sequence error if cccid is for a
760 * command that doesn't use a command buffer.
761 */
762 nvmf_tcp_report_error(qp,
763 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
764 offsetof(struct nvme_tcp_c2h_data_hdr, cccid), pdu->m,
765 pdu->hdr->hlen);
766 nvmf_tcp_free_pdu(pdu);
767 return (EBADMSG);
768 }
769
770 /* For a data digest mismatch, fail the I/O request. */
771 if (pdu->data_digest_mismatch) {
772 cb->error = EINTEGRITY;
773 tcp_remove_command_buffer(&qp->rx_buffers, cb);
774 mtx_unlock(&qp->rx_buffers.lock);
775 tcp_release_command_buffer(cb);
776 nvmf_tcp_free_pdu(pdu);
777 return (0);
778 }
779
780 data_len = le32toh(c2h->datal);
781 if (data_len != pdu->data_len) {
782 mtx_unlock(&qp->rx_buffers.lock);
783 nvmf_tcp_report_error(qp,
784 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
785 offsetof(struct nvme_tcp_c2h_data_hdr, datal), pdu->m,
786 pdu->hdr->hlen);
787 nvmf_tcp_free_pdu(pdu);
788 return (EBADMSG);
789 }
790
791 data_offset = le32toh(c2h->datao);
792 if (data_offset < cb->data_offset ||
793 data_offset + data_len > cb->data_offset + cb->data_len) {
794 mtx_unlock(&qp->rx_buffers.lock);
795 nvmf_tcp_report_error(qp,
796 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
797 pdu->m, pdu->hdr->hlen);
798 nvmf_tcp_free_pdu(pdu);
799 return (EBADMSG);
800 }
801
802 if (data_offset != cb->data_offset + cb->data_xfered) {
803 mtx_unlock(&qp->rx_buffers.lock);
804 nvmf_tcp_report_error(qp,
805 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
806 pdu->hdr->hlen);
807 nvmf_tcp_free_pdu(pdu);
808 return (EBADMSG);
809 }
810
811 if ((cb->data_xfered + data_len == cb->data_len) !=
812 ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_LAST_PDU) != 0)) {
813 mtx_unlock(&qp->rx_buffers.lock);
814 nvmf_tcp_report_error(qp,
815 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
816 pdu->hdr->hlen);
817 nvmf_tcp_free_pdu(pdu);
818 return (EBADMSG);
819 }
820
821 cb->data_xfered += data_len;
822 data_offset -= cb->data_offset;
823 if (cb->data_xfered == cb->data_len)
824 tcp_remove_command_buffer(&qp->rx_buffers, cb);
825 else
826 tcp_hold_command_buffer(cb);
827 mtx_unlock(&qp->rx_buffers.lock);
828
829 mbuf_copyto_io(pdu->m, pdu->hdr->pdo, data_len, &cb->io, data_offset);
830
831 tcp_release_command_buffer(cb);
832
833 if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) {
834 struct nvme_completion cqe;
835 struct nvmf_capsule *nc;
836
837 memset(&cqe, 0, sizeof(cqe));
838 cqe.cid = c2h->cccid;
839
840 nc = nvmf_allocate_response(&qp->qp, &cqe, M_WAITOK);
841 nc->nc_sqhd_valid = false;
842
843 nvmf_capsule_received(&qp->qp, nc);
844 }
845
846 nvmf_tcp_free_pdu(pdu);
847 return (0);
848 }
849
850 /* Called when m_free drops refcount to 0. */
851 static void
852 nvmf_tcp_mbuf_done(struct mbuf *m)
853 {
854 struct nvmf_tcp_command_buffer *cb = m->m_ext.ext_arg1;
855
856 tcp_free_command_buffer(cb);
857 }
858
859 static struct mbuf *
860 nvmf_tcp_mbuf(void *arg, int how, void *data, size_t len)
861 {
862 struct nvmf_tcp_command_buffer *cb = arg;
863 struct mbuf *m;
864
865 m = m_get(how, MT_DATA);
866 m->m_flags |= M_RDONLY;
867 m_extaddref(m, data, len, &cb->refs, nvmf_tcp_mbuf_done, cb, NULL);
868 m->m_len = len;
869 return (m);
870 }
871
872 static void
873 nvmf_tcp_free_mext_pg(struct mbuf *m)
874 {
875 struct nvmf_tcp_command_buffer *cb = m->m_ext.ext_arg1;
876
877 M_ASSERTEXTPG(m);
878 tcp_release_command_buffer(cb);
879 }
880
881 static struct mbuf *
882 nvmf_tcp_mext_pg(void *arg, int how)
883 {
884 struct nvmf_tcp_command_buffer *cb = arg;
885 struct mbuf *m;
886
887 m = mb_alloc_ext_pgs(how, nvmf_tcp_free_mext_pg, M_RDONLY);
888 m->m_ext.ext_arg1 = cb;
889 tcp_hold_command_buffer(cb);
890 return (m);
891 }
892
893 /*
894 * Return an mbuf chain for a range of data belonging to a command
895 * buffer.
896 *
897 * The mbuf chain uses M_EXT mbufs which hold references on the
898 * command buffer so that it remains "alive" until the data has been
899 * fully transmitted. If can_truncate is true, the returned chain
900 * may be shorter than requested to avoid gratuitously splitting up
901 * a page.
902 */
903 static struct mbuf *
904 nvmf_tcp_command_buffer_mbuf(struct nvmf_tcp_command_buffer *cb,
905 uint32_t data_offset, uint32_t data_len, uint32_t *actual_len,
906 bool can_truncate)
907 {
908 struct mbuf *m;
909 size_t len;
910
911 m = memdesc_alloc_ext_mbufs(&cb->io.io_mem, nvmf_tcp_mbuf,
912 nvmf_tcp_mext_pg, cb, M_WAITOK, data_offset, data_len, &len,
913 can_truncate);
914 if (actual_len != NULL)
915 *actual_len = len;
916 return (m);
917 }
918
919 /* NB: cid and ttag are little-endian already. */
920 static void
921 tcp_send_h2c_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
922 uint32_t data_offset, struct mbuf *m, size_t len, bool last_pdu)
923 {
924 struct nvme_tcp_h2c_data_hdr h2c;
925 struct mbuf *top;
926
927 memset(&h2c, 0, sizeof(h2c));
928 h2c.common.pdu_type = NVME_TCP_PDU_TYPE_H2C_DATA;
929 if (last_pdu)
930 h2c.common.flags |= NVME_TCP_H2C_DATA_FLAGS_LAST_PDU;
931 h2c.cccid = cid;
932 h2c.ttag = ttag;
933 h2c.datao = htole32(data_offset);
934 h2c.datal = htole32(len);
935
936 top = nvmf_tcp_construct_pdu(qp, &h2c, sizeof(h2c), m, len);
937 nvmf_tcp_write_pdu(qp, top);
938 }
939
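/* Host: satisfy an R2T by queueing H2C_DATA PDUs with the requested data. */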
940 static int
941 nvmf_tcp_handle_r2t(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
942 {
943 const struct nvme_tcp_r2t_hdr *r2t;
944 struct nvmf_tcp_command_buffer *cb;
945 uint32_t data_len, data_offset;
946
947 r2t = (const void *)pdu->hdr;
948
949 mtx_lock(&qp->tx_buffers.lock);
950 cb = tcp_find_command_buffer(&qp->tx_buffers, r2t->cccid, 0);
951 if (cb == NULL) {
952 mtx_unlock(&qp->tx_buffers.lock);
953 nvmf_tcp_report_error(qp,
954 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
955 offsetof(struct nvme_tcp_r2t_hdr, cccid), pdu->m,
956 pdu->hdr->hlen);
957 nvmf_tcp_free_pdu(pdu);
958 return (EBADMSG);
959 }
960
961 data_offset = le32toh(r2t->r2to);
962 if (data_offset != cb->data_xfered) {
963 mtx_unlock(&qp->tx_buffers.lock);
964 nvmf_tcp_report_error(qp,
965 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
966 pdu->hdr->hlen);
967 nvmf_tcp_free_pdu(pdu);
968 return (EBADMSG);
969 }
970
971 /*
972 * XXX: The spec does not specify how to handle R2T transfers
973 * out of range of the original command.
974 */
975 data_len = le32toh(r2t->r2tl);
976 if (data_offset + data_len > cb->data_len) {
977 mtx_unlock(&qp->tx_buffers.lock);
978 nvmf_tcp_report_error(qp,
979 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
980 pdu->m, pdu->hdr->hlen);
981 nvmf_tcp_free_pdu(pdu);
982 return (EBADMSG);
983 }
984
985 cb->data_xfered += data_len;
986 if (cb->data_xfered == cb->data_len)
987 tcp_remove_command_buffer(&qp->tx_buffers, cb);
988 else
989 tcp_hold_command_buffer(cb);
990 mtx_unlock(&qp->tx_buffers.lock);
991
992 /*
993 * Queue one or more H2C_DATA PDUs containing the requested
994 * data.
995 */
996 while (data_len > 0) {
997 struct mbuf *m;
998 uint32_t sent, todo;
999
1000 todo = min(data_len, qp->max_tx_data);
1001 m = nvmf_tcp_command_buffer_mbuf(cb, data_offset, todo, &sent,
1002 todo < data_len);
1003 tcp_send_h2c_pdu(qp, r2t->cccid, r2t->ttag, data_offset, m,
1004 sent, sent == data_len);
1005
1006 data_offset += sent;
1007 data_len -= sent;
1008 }
1009
1010 tcp_release_command_buffer(cb);
1011 nvmf_tcp_free_pdu(pdu);
1012 return (0);
1013 }
1014
1015 /*
1016 * A variant of m_pullup that uses M_WAITOK instead of failing. It
1017 * also doesn't do anything if enough bytes are already present in the
1018 * first mbuf.
1019 */
1020 static struct mbuf *
1021 pullup_pdu_hdr(struct mbuf *m, int len)
1022 {
1023 struct mbuf *n, *p;
1024
1025 KASSERT(len <= MCLBYTES, ("%s: len too large", __func__));
1026 if (m->m_len >= len)
1027 return (m);
1028
1029 n = m_get2(len, M_WAITOK, MT_DATA, 0);
1030 n->m_len = len;
1031 m_copydata(m, 0, len, mtod(n, void *));
1032
1033 while (m != NULL && m->m_len <= len) {
1034 p = m->m_next;
1035 len -= m->m_len;
1036 m_free(m);
1037 m = p;
1038 }
1039 if (len > 0) {
1040 m->m_data += len;
1041 m->m_len -= len;
1042 }
1043 n->m_next = m;
1044 return (n);
1045 }
1046
1047 static int
1048 nvmf_tcp_dispatch_pdu(struct nvmf_tcp_qpair *qp,
1049 const struct nvme_tcp_common_pdu_hdr *ch, struct nvmf_tcp_rxpdu *pdu)
1050 {
1051 /* Ensure the PDU header is contiguous. */
1052 pdu->m = pullup_pdu_hdr(pdu->m, ch->hlen);
1053 pdu->hdr = mtod(pdu->m, const void *);
1054
1055 switch (ch->pdu_type) {
1056 default:
1057 __assert_unreachable();
1058 break;
1059 case NVME_TCP_PDU_TYPE_H2C_TERM_REQ:
1060 case NVME_TCP_PDU_TYPE_C2H_TERM_REQ:
1061 return (nvmf_tcp_handle_term_req(pdu));
1062 case NVME_TCP_PDU_TYPE_CAPSULE_CMD:
1063 return (nvmf_tcp_save_command_capsule(qp, pdu));
1064 case NVME_TCP_PDU_TYPE_CAPSULE_RESP:
1065 return (nvmf_tcp_save_response_capsule(qp, pdu));
1066 case NVME_TCP_PDU_TYPE_H2C_DATA:
1067 return (nvmf_tcp_handle_h2c_data(qp, pdu));
1068 case NVME_TCP_PDU_TYPE_C2H_DATA:
1069 return (nvmf_tcp_handle_c2h_data(qp, pdu));
1070 case NVME_TCP_PDU_TYPE_R2T:
1071 return (nvmf_tcp_handle_r2t(qp, pdu));
1072 }
1073 }
1074
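/* Receive thread: read PDUs from the socket, validate, and dispatch them. */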
1075 static void
1076 nvmf_tcp_receive(void *arg)
1077 {
1078 struct nvmf_tcp_qpair *qp = arg;
1079 struct socket *so = qp->so;
1080 struct nvmf_tcp_rxpdu pdu;
1081 struct nvme_tcp_common_pdu_hdr ch;
1082 struct uio uio;
1083 struct iovec iov[1];
1084 struct mbuf *m, *n, *tail;
1085 u_int avail, needed;
1086 int error, flags, terror;
1087 bool have_header;
1088
1089 m = tail = NULL;
1090 have_header = false;
1091 SOCKBUF_LOCK(&so->so_rcv);
1092 while (!qp->rx_shutdown) {
1093 /* Wait until there is enough data for the next step. */
1094 if (so->so_error != 0 || so->so_rerror != 0) {
1095 if (so->so_error != 0)
1096 error = so->so_error;
1097 else
1098 error = so->so_rerror;
1099 SOCKBUF_UNLOCK(&so->so_rcv);
1100 error:
1101 m_freem(m);
1102 nvmf_qpair_error(&qp->qp, error);
1103 SOCKBUF_LOCK(&so->so_rcv);
1104 while (!qp->rx_shutdown)
1105 cv_wait(&qp->rx_cv, SOCKBUF_MTX(&so->so_rcv));
1106 break;
1107 }
1108 avail = sbavail(&so->so_rcv);
1109 if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) != 0) {
1110 if (!have_header && avail == 0)
1111 error = 0;
1112 else
1113 error = ECONNRESET;
1114 SOCKBUF_UNLOCK(&so->so_rcv);
1115 goto error;
1116 }
1117 if (avail == 0 || (!have_header && avail < sizeof(ch))) {
1118 cv_wait(&qp->rx_cv, SOCKBUF_MTX(&so->so_rcv));
1119 continue;
1120 }
1121 SOCKBUF_UNLOCK(&so->so_rcv);
1122
1123 if (!have_header) {
1124 KASSERT(m == NULL, ("%s: m != NULL but no header",
1125 __func__));
1126 memset(&uio, 0, sizeof(uio));
1127 iov[0].iov_base = &ch;
1128 iov[0].iov_len = sizeof(ch);
1129 uio.uio_iov = iov;
1130 uio.uio_iovcnt = 1;
1131 uio.uio_resid = sizeof(ch);
1132 uio.uio_segflg = UIO_SYSSPACE;
1133 uio.uio_rw = UIO_READ;
1134 flags = MSG_DONTWAIT | MSG_PEEK;
1135
1136 error = soreceive(so, NULL, &uio, NULL, NULL, &flags);
1137 if (error != 0)
1138 goto error;
1139 KASSERT(uio.uio_resid == 0, ("%s: short CH read",
1140 __func__));
1141
1142 have_header = true;
1143 needed = le32toh(ch.plen);
1144
1145 /*
1146 * Malformed PDUs will be reported as errors
1147 * by nvmf_tcp_validate_pdu. Just pass along
1148 * garbage headers if the lengths mismatch.
1149 */
1150 if (needed < sizeof(ch) || ch.hlen > needed)
1151 needed = sizeof(ch);
1152
1153 memset(&uio, 0, sizeof(uio));
1154 uio.uio_resid = needed;
1155 }
1156
1157 flags = MSG_DONTWAIT;
1158 error = soreceive(so, NULL, &uio, &n, NULL, &flags);
1159 if (error != 0)
1160 goto error;
1161
1162 if (m == NULL)
1163 m = n;
1164 else
1165 tail->m_next = n;
1166
1167 if (uio.uio_resid != 0) {
1168 tail = n;
1169 while (tail->m_next != NULL)
1170 tail = tail->m_next;
1171
1172 SOCKBUF_LOCK(&so->so_rcv);
1173 continue;
1174 }
1175 #ifdef INVARIANTS
1176 tail = NULL;
1177 #endif
1178
1179 pdu.m = m;
1180 m = NULL;
1181 pdu.hdr = &ch;
1182 error = nvmf_tcp_validate_pdu(qp, &pdu);
1183 if (error != 0)
1184 m_freem(pdu.m);
1185 else
1186 error = nvmf_tcp_dispatch_pdu(qp, &ch, &pdu);
1187 if (error != 0) {
1188 /*
1189 * If we received a termination request, close
1190 * the connection immediately.
1191 */
1192 if (error == ECONNRESET)
1193 goto error;
1194
1195 /*
1196 * Wait for up to 30 seconds for the socket to
1197 * be closed by the other end.
1198 */
1199 SOCKBUF_LOCK(&so->so_rcv);
1200 if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1201 terror = cv_timedwait(&qp->rx_cv,
1202 SOCKBUF_MTX(&so->so_rcv), 30 * hz);
1203 if (terror == ETIMEDOUT)
1204 printf("NVMe/TCP: Timed out after sending terminate request\n");
1205 }
1206 SOCKBUF_UNLOCK(&so->so_rcv);
1207 goto error;
1208 }
1209
1210 have_header = false;
1211 SOCKBUF_LOCK(&so->so_rcv);
1212 }
1213 SOCKBUF_UNLOCK(&so->so_rcv);
1214 kthread_exit();
1215 }
1216
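/* Build a CAPSULE_CMD PDU, using in-capsule data when the payload fits. */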
1217 static struct mbuf *
1218 tcp_command_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_capsule *tc)
1219 {
1220 struct nvmf_capsule *nc = &tc->nc;
1221 struct nvmf_tcp_command_buffer *cb;
1222 struct nvme_sgl_descriptor *sgl;
1223 struct nvme_tcp_cmd cmd;
1224 struct mbuf *top, *m;
1225 bool use_icd;
1226
1227 use_icd = false;
1228 cb = NULL;
1229 m = NULL;
1230
1231 if (nc->nc_data.io_len != 0) {
1232 cb = tcp_alloc_command_buffer(qp, &nc->nc_data, 0,
1233 nc->nc_data.io_len, nc->nc_sqe.cid);
1234
1235 if (nc->nc_send_data && nc->nc_data.io_len <= qp->max_icd) {
1236 use_icd = true;
1237 m = nvmf_tcp_command_buffer_mbuf(cb, 0,
1238 nc->nc_data.io_len, NULL, false);
1239 cb->data_xfered = nc->nc_data.io_len;
1240 tcp_release_command_buffer(cb);
1241 } else if (nc->nc_send_data) {
1242 mtx_lock(&qp->tx_buffers.lock);
1243 tcp_add_command_buffer(&qp->tx_buffers, cb);
1244 mtx_unlock(&qp->tx_buffers.lock);
1245 } else {
1246 mtx_lock(&qp->rx_buffers.lock);
1247 tcp_add_command_buffer(&qp->rx_buffers, cb);
1248 mtx_unlock(&qp->rx_buffers.lock);
1249 }
1250 }
1251
1252 memset(&cmd, 0, sizeof(cmd));
1253 cmd.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_CMD;
1254 cmd.ccsqe = nc->nc_sqe;
1255
1256 /* Populate SGL in SQE. */
1257 sgl = &cmd.ccsqe.sgl;
1258 memset(sgl, 0, sizeof(*sgl));
1259 sgl->address = 0;
1260 sgl->length = htole32(nc->nc_data.io_len);
1261 if (use_icd) {
1262 /* Use in-capsule data. */
1263 sgl->type = NVME_SGL_TYPE_ICD;
1264 } else {
1265 /* Use a command buffer. */
1266 sgl->type = NVME_SGL_TYPE_COMMAND_BUFFER;
1267 }
1268
1269 top = nvmf_tcp_construct_pdu(qp, &cmd, sizeof(cmd), m, m != NULL ?
1270 nc->nc_data.io_len : 0);
1271 return (top);
1272 }
1273
1274 static struct mbuf *
1275 tcp_response_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_capsule *tc)
1276 {
1277 struct nvmf_capsule *nc = &tc->nc;
1278 struct nvme_tcp_rsp rsp;
1279
1280 memset(&rsp, 0, sizeof(rsp));
1281 rsp.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_RESP;
1282 rsp.rccqe = nc->nc_cqe;
1283
1284 return (nvmf_tcp_construct_pdu(qp, &rsp, sizeof(rsp), NULL, 0));
1285 }
1286
1287 static struct mbuf *
1288 capsule_to_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_capsule *tc)
1289 {
1290 if (tc->nc.nc_qe_len == sizeof(struct nvme_command))
1291 return (tcp_command_pdu(qp, tc));
1292 else
1293 return (tcp_response_pdu(qp, tc));
1294 }
1295
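/* Transmit thread: turn queued capsules into PDUs and write them out. */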
1296 static void
1297 nvmf_tcp_send(void *arg)
1298 {
1299 struct nvmf_tcp_qpair *qp = arg;
1300 struct nvmf_tcp_capsule *tc;
1301 struct socket *so = qp->so;
1302 struct mbuf *m, *n, *p;
1303 u_long space, tosend;
1304 int error;
1305
1306 m = NULL;
1307 SOCKBUF_LOCK(&so->so_snd);
1308 while (!qp->tx_shutdown) {
1309 if (so->so_error != 0) {
1310 error = so->so_error;
1311 SOCKBUF_UNLOCK(&so->so_snd);
1312 error:
1313 m_freem(m);
1314 nvmf_qpair_error(&qp->qp, error);
1315 SOCKBUF_LOCK(&so->so_snd);
1316 while (!qp->tx_shutdown)
1317 cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd));
1318 break;
1319 }
1320
1321 if (m == NULL) {
1322 /* Next PDU to send. */
1323 m = mbufq_dequeue(&qp->tx_pdus);
1324 }
1325 if (m == NULL) {
1326 if (STAILQ_EMPTY(&qp->tx_capsules)) {
1327 cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd));
1328 continue;
1329 }
1330
1331 /* Convert a capsule into a PDU. */
1332 tc = STAILQ_FIRST(&qp->tx_capsules);
1333 STAILQ_REMOVE_HEAD(&qp->tx_capsules, link);
1334 SOCKBUF_UNLOCK(&so->so_snd);
1335
1336 n = capsule_to_pdu(qp, tc);
1337 tcp_release_capsule(tc);
1338
1339 SOCKBUF_LOCK(&so->so_snd);
1340 mbufq_enqueue(&qp->tx_pdus, n);
1341 continue;
1342 }
1343
1344 /*
1345 * Wait until there is enough room to send some data.
1346 * If the socket buffer is empty, always send at least
1347 * something.
1348 */
1349 space = sbspace(&so->so_snd);
1350 if (space < m->m_len && sbused(&so->so_snd) != 0) {
1351 cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd));
1352 continue;
1353 }
1354 SOCKBUF_UNLOCK(&so->so_snd);
1355
1356 /*
1357 * If 'm' is too big, then the socket buffer must be
1358 * empty. Split 'm' to make at least some forward
1359 * progress.
1360 *
1361 * Otherwise, chain up as many pending mbufs from 'm'
1362 * that will fit.
1363 */
1364 if (m->m_len > space) {
1365 n = m_split(m, space, M_WAITOK);
1366 } else {
1367 tosend = m->m_len;
1368 n = m->m_next;
1369 p = m;
1370 while (n != NULL && tosend + n->m_len <= space) {
1371 tosend += n->m_len;
1372 p = n;
1373 n = n->m_next;
1374 }
1375 KASSERT(p->m_next == n, ("%s: p not before n",
1376 __func__));
1377 p->m_next = NULL;
1378
1379 KASSERT(m_length(m, NULL) == tosend,
1380 ("%s: length mismatch", __func__));
1381 }
1382 error = sosend(so, NULL, NULL, m, NULL, MSG_DONTWAIT, NULL);
1383 if (error != 0) {
1384 m = NULL;
1385 m_freem(n);
1386 goto error;
1387 }
1388 m = n;
1389 SOCKBUF_LOCK(&so->so_snd);
1390 }
1391 SOCKBUF_UNLOCK(&so->so_snd);
1392 kthread_exit();
1393 }
1394
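/* Socket receive upcall: wake the receive thread when data arrives. */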
1395 static int
1396 nvmf_soupcall_receive(struct socket *so, void *arg, int waitflag)
1397 {
1398 struct nvmf_tcp_qpair *qp = arg;
1399
1400 if (soreadable(so))
1401 cv_signal(&qp->rx_cv);
1402 return (SU_OK);
1403 }
1404
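/* Socket send upcall: wake the transmit thread when space is available. */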
1405 static int
1406 nvmf_soupcall_send(struct socket *so, void *arg, int waitflag)
1407 {
1408 struct nvmf_tcp_qpair *qp = arg;
1409
1410 if (sowriteable(so))
1411 cv_signal(&qp->tx_cv);
1412 return (SU_OK);
1413 }
1414
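/* Create a qpair from a handed-off TCP socket and start its RX/TX threads. */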
1415 static struct nvmf_qpair *
1416 tcp_allocate_qpair(bool controller,
1417 const struct nvmf_handoff_qpair_params *params)
1418 {
1419 struct nvmf_tcp_qpair *qp;
1420 struct socket *so;
1421 struct file *fp;
1422 cap_rights_t rights;
1423 int error;
1424
1425 error = fget(curthread, params->tcp.fd, cap_rights_init_one(&rights,
1426 CAP_SOCK_CLIENT), &fp);
1427 if (error != 0)
1428 return (NULL);
1429 if (fp->f_type != DTYPE_SOCKET) {
1430 fdrop(fp, curthread);
1431 return (NULL);
1432 }
1433 so = fp->f_data;
1434 if (so->so_type != SOCK_STREAM ||
1435 so->so_proto->pr_protocol != IPPROTO_TCP) {
1436 fdrop(fp, curthread);
1437 return (NULL);
1438 }
1439
1440 /* Claim socket from file descriptor. */
1441 fp->f_ops = &badfileops;
1442 fp->f_data = NULL;
1443 fdrop(fp, curthread);
1444
1445 qp = malloc(sizeof(*qp), M_NVMF_TCP, M_WAITOK | M_ZERO);
1446 qp->so = so;
1447 refcount_init(&qp->refs, 1);
1448 qp->txpda = params->tcp.txpda;
1449 qp->rxpda = params->tcp.rxpda;
1450 qp->header_digests = params->tcp.header_digests;
1451 qp->data_digests = params->tcp.data_digests;
1452 qp->maxr2t = params->tcp.maxr2t;
1453 if (controller)
1454 qp->maxh2cdata = params->tcp.maxh2cdata;
1455 qp->max_tx_data = tcp_max_transmit_data;
1456 if (!controller) {
1457 qp->max_tx_data = min(qp->max_tx_data, params->tcp.maxh2cdata);
1458 qp->max_icd = params->tcp.max_icd;
1459 }
1460
1461 if (controller) {
1462 /* Use the SUCCESS flag if SQ flow control is disabled. */
1463 qp->send_success = !params->sq_flow_control;
1464
1465 /* NB: maxr2t is 0's based. */
1466 qp->num_ttags = MIN((u_int)UINT16_MAX + 1,
1467 (uint64_t)params->qsize * ((uint64_t)qp->maxr2t + 1));
1468 qp->open_ttags = mallocarray(qp->num_ttags,
1469 sizeof(*qp->open_ttags), M_NVMF_TCP, M_WAITOK | M_ZERO);
1470 }
1471
1472 TAILQ_INIT(&qp->rx_buffers.head);
1473 TAILQ_INIT(&qp->tx_buffers.head);
1474 mtx_init(&qp->rx_buffers.lock, "nvmf/tcp rx buffers", NULL, MTX_DEF);
1475 mtx_init(&qp->tx_buffers.lock, "nvmf/tcp tx buffers", NULL, MTX_DEF);
1476
1477 cv_init(&qp->rx_cv, "-");
1478 cv_init(&qp->tx_cv, "-");
1479 mbufq_init(&qp->tx_pdus, 0);
1480 STAILQ_INIT(&qp->tx_capsules);
1481
1482 /* Register socket upcalls. */
1483 SOCKBUF_LOCK(&so->so_rcv);
1484 soupcall_set(so, SO_RCV, nvmf_soupcall_receive, qp);
1485 SOCKBUF_UNLOCK(&so->so_rcv);
1486 SOCKBUF_LOCK(&so->so_snd);
1487 soupcall_set(so, SO_SND, nvmf_soupcall_send, qp);
1488 SOCKBUF_UNLOCK(&so->so_snd);
1489
1490 /* Spin up kthreads. */
1491 error = kthread_add(nvmf_tcp_receive, qp, NULL, &qp->rx_thread, 0, 0,
1492 "nvmef tcp rx");
1493 if (error != 0) {
1494 tcp_free_qpair(&qp->qp);
1495 return (NULL);
1496 }
1497 error = kthread_add(nvmf_tcp_send, qp, NULL, &qp->tx_thread, 0, 0,
1498 "nvmef tcp tx");
1499 if (error != 0) {
1500 tcp_free_qpair(&qp->qp);
1501 return (NULL);
1502 }
1503
1504 return (&qp->qp);
1505 }
1506
1507 static void
1508 tcp_release_qpair(struct nvmf_tcp_qpair *qp)
1509 {
1510 if (refcount_release(&qp->refs))
1511 free(qp, M_NVMF_TCP);
1512 }
1513
1514 static void
1515 tcp_free_qpair(struct nvmf_qpair *nq)
1516 {
1517 struct nvmf_tcp_qpair *qp = TQP(nq);
1518 struct nvmf_tcp_command_buffer *ncb, *cb;
1519 struct nvmf_tcp_capsule *ntc, *tc;
1520 struct socket *so = qp->so;
1521
1522 /* Shut down kthreads and clear upcalls */
1523 SOCKBUF_LOCK(&so->so_snd);
1524 qp->tx_shutdown = true;
1525 if (qp->tx_thread != NULL) {
1526 cv_signal(&qp->tx_cv);
1527 mtx_sleep(qp->tx_thread, SOCKBUF_MTX(&so->so_snd), 0,
1528 "nvtcptx", 0);
1529 }
1530 soupcall_clear(so, SO_SND);
1531 SOCKBUF_UNLOCK(&so->so_snd);
1532
1533 SOCKBUF_LOCK(&so->so_rcv);
1534 qp->rx_shutdown = true;
1535 if (qp->rx_thread != NULL) {
1536 cv_signal(&qp->rx_cv);
1537 mtx_sleep(qp->rx_thread, SOCKBUF_MTX(&so->so_rcv), 0,
1538 "nvtcprx", 0);
1539 }
1540 soupcall_clear(so, SO_RCV);
1541 SOCKBUF_UNLOCK(&so->so_rcv);
1542
1543 STAILQ_FOREACH_SAFE(tc, &qp->tx_capsules, link, ntc) {
1544 nvmf_abort_capsule_data(&tc->nc, ECONNABORTED);
1545 tcp_release_capsule(tc);
1546 }
1547 mbufq_drain(&qp->tx_pdus);
1548
1549 cv_destroy(&qp->tx_cv);
1550 cv_destroy(&qp->rx_cv);
1551
1552 if (qp->open_ttags != NULL) {
1553 for (u_int i = 0; i < qp->num_ttags; i++) {
1554 cb = qp->open_ttags[i];
1555 if (cb != NULL) {
1556 cb->tc->active_r2ts--;
1557 cb->error = ECONNABORTED;
1558 tcp_release_command_buffer(cb);
1559 }
1560 }
1561 free(qp->open_ttags, M_NVMF_TCP);
1562 }
1563
1564 mtx_lock(&qp->rx_buffers.lock);
1565 TAILQ_FOREACH_SAFE(cb, &qp->rx_buffers.head, link, ncb) {
1566 tcp_remove_command_buffer(&qp->rx_buffers, cb);
1567 mtx_unlock(&qp->rx_buffers.lock);
1568 #ifdef INVARIANTS
1569 if (cb->tc != NULL)
1570 cb->tc->pending_r2ts--;
1571 #endif
1572 cb->error = ECONNABORTED;
1573 tcp_release_command_buffer(cb);
1574 mtx_lock(&qp->rx_buffers.lock);
1575 }
1576 mtx_destroy(&qp->rx_buffers.lock);
1577
1578 mtx_lock(&qp->tx_buffers.lock);
1579 TAILQ_FOREACH_SAFE(cb, &qp->tx_buffers.head, link, ncb) {
1580 tcp_remove_command_buffer(&qp->tx_buffers, cb);
1581 mtx_unlock(&qp->tx_buffers.lock);
1582 cb->error = ECONNABORTED;
1583 tcp_release_command_buffer(cb);
1584 mtx_lock(&qp->tx_buffers.lock);
1585 }
1586 mtx_destroy(&qp->tx_buffers.lock);
1587
1588 soclose(so);
1589
1590 tcp_release_qpair(qp);
1591 }
1592
1593 static struct nvmf_capsule *
1594 tcp_allocate_capsule(struct nvmf_qpair *nq, int how)
1595 {
1596 struct nvmf_tcp_qpair *qp = TQP(nq);
1597 struct nvmf_tcp_capsule *tc;
1598
1599 tc = malloc(sizeof(*tc), M_NVMF_TCP, how | M_ZERO);
1600 if (tc == NULL)
1601 return (NULL);
1602 refcount_init(&tc->refs, 1);
1603 refcount_acquire(&qp->refs);
1604 return (&tc->nc);
1605 }
1606
1607 static void
1608 tcp_release_capsule(struct nvmf_tcp_capsule *tc)
1609 {
1610 struct nvmf_tcp_qpair *qp = TQP(tc->nc.nc_qpair);
1611
1612 if (!refcount_release(&tc->refs))
1613 return;
1614
1615 MPASS(tc->active_r2ts == 0);
1616 MPASS(tc->pending_r2ts == 0);
1617
1618 nvmf_tcp_free_pdu(&tc->rx_pdu);
1619 free(tc, M_NVMF_TCP);
1620 tcp_release_qpair(qp);
1621 }
1622
1623 static void
1624 tcp_free_capsule(struct nvmf_capsule *nc)
1625 {
1626 struct nvmf_tcp_capsule *tc = TCAP(nc);
1627
1628 tcp_release_capsule(tc);
1629 }
1630
1631 static int
1632 tcp_transmit_capsule(struct nvmf_capsule *nc)
1633 {
1634 struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
1635 struct nvmf_tcp_capsule *tc = TCAP(nc);
1636 struct socket *so = qp->so;
1637
1638 refcount_acquire(&tc->refs);
1639 SOCKBUF_LOCK(&so->so_snd);
1640 STAILQ_INSERT_TAIL(&qp->tx_capsules, tc, link);
1641 if (sowriteable(so))
1642 cv_signal(&qp->tx_cv);
1643 SOCKBUF_UNLOCK(&so->so_snd);
1644 return (0);
1645 }
1646
1647 static uint8_t
1648 tcp_validate_command_capsule(struct nvmf_capsule *nc)
1649 {
1650 struct nvmf_tcp_capsule *tc = TCAP(nc);
1651 struct nvme_sgl_descriptor *sgl;
1652
1653 KASSERT(tc->rx_pdu.hdr != NULL, ("capsule wasn't received"));
1654
1655 sgl = &nc->nc_sqe.sgl;
1656 switch (sgl->type) {
1657 case NVME_SGL_TYPE_ICD:
1658 if (tc->rx_pdu.data_len != le32toh(sgl->length)) {
1659 printf("NVMe/TCP: Command Capsule with mismatched ICD length\n");
1660 return (NVME_SC_DATA_SGL_LENGTH_INVALID);
1661 }
1662 break;
1663 case NVME_SGL_TYPE_COMMAND_BUFFER:
1664 if (tc->rx_pdu.data_len != 0) {
1665 printf("NVMe/TCP: Command Buffer SGL with ICD\n");
1666 return (NVME_SC_INVALID_FIELD);
1667 }
1668 break;
1669 default:
1670 printf("NVMe/TCP: Invalid SGL type in Command Capsule\n");
1671 return (NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID);
1672 }
1673
1674 if (sgl->address != 0) {
1675 printf("NVMe/TCP: Invalid SGL offset in Command Capsule\n");
1676 return (NVME_SC_SGL_OFFSET_INVALID);
1677 }
1678
1679 return (NVME_SC_SUCCESS);
1680 }
1681
1682 static size_t
1683 tcp_capsule_data_len(const struct nvmf_capsule *nc)
1684 {
1685 MPASS(nc->nc_qe_len == sizeof(struct nvme_command));
1686 return (le32toh(nc->nc_sqe.sgl.length));
1687 }
1688
1689 static void
1690 tcp_receive_r2t_data(struct nvmf_capsule *nc, uint32_t data_offset,
1691 struct nvmf_io_request *io)
1692 {
1693 struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
1694 struct nvmf_tcp_capsule *tc = TCAP(nc);
1695 struct nvmf_tcp_command_buffer *cb;
1696
1697 cb = tcp_alloc_command_buffer(qp, io, data_offset, io->io_len,
1698 nc->nc_sqe.cid);
1699
1700 cb->tc = tc;
1701 refcount_acquire(&tc->refs);
1702
1703 /*
1704 * If this command has too many active R2Ts or there are no
1705 * available transfer tags, queue the request for later.
1706 *
1707 * NB: maxr2t is 0's based.
1708 */
1709 mtx_lock(&qp->rx_buffers.lock);
1710 if (tc->active_r2ts > qp->maxr2t || qp->active_ttags == qp->num_ttags) {
1711 #ifdef INVARIANTS
1712 tc->pending_r2ts++;
1713 #endif
1714 TAILQ_INSERT_TAIL(&qp->rx_buffers.head, cb, link);
1715 mtx_unlock(&qp->rx_buffers.lock);
1716 return;
1717 }
1718
1719 nvmf_tcp_allocate_ttag(qp, cb);
1720 mtx_unlock(&qp->rx_buffers.lock);
1721
1722 tcp_send_r2t(qp, nc->nc_sqe.cid, cb->ttag, data_offset, io->io_len);
1723 }
1724
1725 static void
1726 tcp_receive_icd_data(struct nvmf_capsule *nc, uint32_t data_offset,
1727 struct nvmf_io_request *io)
1728 {
1729 struct nvmf_tcp_capsule *tc = TCAP(nc);
1730
1731 mbuf_copyto_io(tc->rx_pdu.m, tc->rx_pdu.hdr->pdo + data_offset,
1732 io->io_len, io, 0);
1733 nvmf_complete_io_request(io, io->io_len, 0);
1734 }
1735
1736 static int
1737 tcp_receive_controller_data(struct nvmf_capsule *nc, uint32_t data_offset,
1738 struct nvmf_io_request *io)
1739 {
1740 struct nvme_sgl_descriptor *sgl;
1741 size_t data_len;
1742
1743 if (nc->nc_qe_len != sizeof(struct nvme_command) ||
1744 !nc->nc_qpair->nq_controller)
1745 return (EINVAL);
1746
1747 sgl = &nc->nc_sqe.sgl;
1748 data_len = le32toh(sgl->length);
1749 if (data_offset + io->io_len > data_len)
1750 return (EFBIG);
1751
1752 if (sgl->type == NVME_SGL_TYPE_ICD)
1753 tcp_receive_icd_data(nc, data_offset, io);
1754 else
1755 tcp_receive_r2t_data(nc, data_offset, io);
1756 return (0);
1757 }
1758
1759 /* NB: cid is little-endian already. */
1760 static void
1761 tcp_send_c2h_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid, uint32_t data_offset,
1762 struct mbuf *m, size_t len, bool last_pdu, bool success)
1763 {
1764 struct nvme_tcp_c2h_data_hdr c2h;
1765 struct mbuf *top;
1766
1767 memset(&c2h, 0, sizeof(c2h));
1768 c2h.common.pdu_type = NVME_TCP_PDU_TYPE_C2H_DATA;
1769 if (last_pdu)
1770 c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_LAST_PDU;
1771 if (success)
1772 c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_SUCCESS;
1773 c2h.cccid = cid;
1774 c2h.datao = htole32(data_offset);
1775 c2h.datal = htole32(len);
1776
1777 top = nvmf_tcp_construct_pdu(qp, &c2h, sizeof(c2h), m, len);
1778 nvmf_tcp_write_pdu(qp, top);
1779 }
1780
1781 static u_int
1782 tcp_send_controller_data(struct nvmf_capsule *nc, uint32_t data_offset,
1783 struct mbuf *m, size_t len)
1784 {
1785 struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
1786 struct nvme_sgl_descriptor *sgl;
1787 uint32_t data_len;
1788 bool last_pdu, last_xfer;
1789
1790 if (nc->nc_qe_len != sizeof(struct nvme_command) ||
1791 !qp->qp.nq_controller) {
1792 m_freem(m);
1793 return (NVME_SC_INVALID_FIELD);
1794 }
1795
1796 sgl = &nc->nc_sqe.sgl;
1797 data_len = le32toh(sgl->length);
1798 if (data_offset + len > data_len) {
1799 m_freem(m);
1800 return (NVME_SC_INVALID_FIELD);
1801 }
1802 last_xfer = (data_offset + len == data_len);
1803
1804 if (sgl->type != NVME_SGL_TYPE_COMMAND_BUFFER) {
1805 m_freem(m);
1806 return (NVME_SC_INVALID_FIELD);
1807 }
1808
1809 KASSERT(data_offset == TCAP(nc)->tx_data_offset,
1810 ("%s: starting data_offset %u doesn't match end of previous xfer %u",
1811 __func__, data_offset, TCAP(nc)->tx_data_offset));
1812
1813 /* Queue one or more C2H_DATA PDUs containing the data from 'm'. */
1814 while (m != NULL) {
1815 struct mbuf *n;
1816 uint32_t todo;
1817
1818 if (m->m_len > qp->max_tx_data) {
1819 n = m_split(m, qp->max_tx_data, M_WAITOK);
1820 todo = m->m_len;
1821 } else {
1822 struct mbuf *p;
1823
1824 todo = m->m_len;
1825 p = m;
1826 n = p->m_next;
1827 while (n != NULL) {
1828 if (todo + n->m_len > qp->max_tx_data) {
1829 p->m_next = NULL;
1830 break;
1831 }
1832 todo += n->m_len;
1833 p = n;
1834 n = p->m_next;
1835 }
1836 MPASS(m_length(m, NULL) == todo);
1837 }
1838
1839 last_pdu = (n == NULL && last_xfer);
1840 tcp_send_c2h_pdu(qp, nc->nc_sqe.cid, data_offset, m, todo,
1841 last_pdu, last_pdu && qp->send_success);
1842
1843 data_offset += todo;
1844 data_len -= todo;
1845 m = n;
1846 }
1847 MPASS(data_len == 0);
1848
1849 #ifdef INVARIANTS
1850 TCAP(nc)->tx_data_offset = data_offset;
1851 #endif
1852 if (!last_xfer)
1853 return (NVMF_MORE);
1854 else if (qp->send_success)
1855 return (NVMF_SUCCESS_SENT);
1856 else
1857 return (NVME_SC_SUCCESS);
1858 }
1859
1860 struct nvmf_transport_ops tcp_ops = {
1861 .allocate_qpair = tcp_allocate_qpair,
1862 .free_qpair = tcp_free_qpair,
1863 .allocate_capsule = tcp_allocate_capsule,
1864 .free_capsule = tcp_free_capsule,
1865 .transmit_capsule = tcp_transmit_capsule,
1866 .validate_command_capsule = tcp_validate_command_capsule,
1867 .capsule_data_len = tcp_capsule_data_len,
1868 .receive_controller_data = tcp_receive_controller_data,
1869 .send_controller_data = tcp_send_controller_data,
1870 .trtype = NVMF_TRTYPE_TCP,
1871 .priority = 0,
1872 };
1873
1874 NVMF_TRANSPORT(tcp, tcp_ops);
1875