1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2022-2024 Chelsio Communications, Inc.
5 * Written by: John Baldwin <jhb@FreeBSD.org>
6 */
7
8 #include <sys/param.h>
9 #include <sys/capsicum.h>
10 #include <sys/condvar.h>
11 #include <sys/file.h>
12 #include <sys/gsb_crc32.h>
13 #include <sys/kernel.h>
14 #include <sys/kthread.h>
15 #include <sys/limits.h>
16 #include <sys/lock.h>
17 #include <sys/malloc.h>
18 #include <sys/mbuf.h>
19 #include <sys/module.h>
20 #include <sys/mutex.h>
21 #include <sys/nv.h>
22 #include <sys/protosw.h>
23 #include <sys/refcount.h>
24 #include <sys/socket.h>
25 #include <sys/socketvar.h>
26 #include <sys/sysctl.h>
27 #include <sys/uio.h>
28 #include <netinet/in.h>
29 #include <dev/nvme/nvme.h>
30 #include <dev/nvmf/nvmf.h>
31 #include <dev/nvmf/nvmf_proto.h>
32 #include <dev/nvmf/nvmf_tcp.h>
33 #include <dev/nvmf/nvmf_transport.h>
34 #include <dev/nvmf/nvmf_transport_internal.h>
35
36 struct nvmf_tcp_capsule;
37 struct nvmf_tcp_qpair;
38
39 struct nvmf_tcp_command_buffer {
40 struct nvmf_tcp_qpair *qp;
41
42 struct nvmf_io_request io;
43 size_t data_len;
44 size_t data_xfered;
45 uint32_t data_offset;
46
47 u_int refs;
48 int error;
49
50 uint16_t cid;
51 uint16_t ttag;
52
53 TAILQ_ENTRY(nvmf_tcp_command_buffer) link;
54
55 /* Controller only */
56 struct nvmf_tcp_capsule *tc;
57 };
58
59 struct nvmf_tcp_command_buffer_list {
60 TAILQ_HEAD(, nvmf_tcp_command_buffer) head;
61 struct mtx lock;
62 };
63
64 struct nvmf_tcp_qpair {
65 struct nvmf_qpair qp;
66
67 struct socket *so;
68
69 volatile u_int refs; /* Every allocated capsule holds a reference */
70 uint8_t txpda;
71 uint8_t rxpda;
72 bool header_digests;
73 bool data_digests;
74 uint32_t maxr2t;
75 uint32_t maxh2cdata; /* Controller only */
76 uint32_t max_tx_data;
77 uint32_t max_icd; /* Host only */
78 uint16_t next_ttag; /* Controller only */
79 u_int num_ttags; /* Controller only */
80 u_int active_ttags; /* Controller only */
81 bool send_success; /* Controller only */
82
83 /* Receive state. */
84 struct thread *rx_thread;
85 struct cv rx_cv;
86 bool rx_shutdown;
87
88 /* Transmit state. */
89 struct thread *tx_thread;
90 struct cv tx_cv;
91 bool tx_shutdown;
92 struct mbufq tx_pdus;
93 STAILQ_HEAD(, nvmf_tcp_capsule) tx_capsules;
94
95 struct nvmf_tcp_command_buffer_list tx_buffers;
96 struct nvmf_tcp_command_buffer_list rx_buffers;
97
/*
 * For the controller, an RX command buffer can be in one of two
 * locations, both protected by rx_buffers.lock.  If a receive
 * request is waiting for either an R2T slot for its command (due to
 * exceeding MAXR2T) or a transfer tag, it is placed on the
 * rx_buffers list.  When a request is allocated an active transfer
 * tag, it moves to the open_ttags[] array (indexed by the tag)
 * until it completes.
 */
107 struct nvmf_tcp_command_buffer **open_ttags; /* Controller only */
108 };
109
110 struct nvmf_tcp_rxpdu {
111 struct mbuf *m;
112 const struct nvme_tcp_common_pdu_hdr *hdr;
113 uint32_t data_len;
114 bool data_digest_mismatch;
115 };
116
117 struct nvmf_tcp_capsule {
118 struct nvmf_capsule nc;
119
120 volatile u_int refs;
121
122 struct nvmf_tcp_rxpdu rx_pdu;
123
124 uint32_t active_r2ts; /* Controller only */
125 #ifdef INVARIANTS
126 uint32_t tx_data_offset; /* Controller only */
127 u_int pending_r2ts; /* Controller only */
128 #endif
129
130 STAILQ_ENTRY(nvmf_tcp_capsule) link;
131 };
132
133 #define TCAP(nc) ((struct nvmf_tcp_capsule *)(nc))
134 #define TQP(qp) ((struct nvmf_tcp_qpair *)(qp))
135
136 static void tcp_release_capsule(struct nvmf_tcp_capsule *tc);
137 static void tcp_free_qpair(struct nvmf_qpair *nq);
138
139 SYSCTL_NODE(_kern_nvmf, OID_AUTO, tcp, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
140 "TCP transport");
141 static u_int tcp_max_transmit_data = 256 * 1024;
142 SYSCTL_UINT(_kern_nvmf_tcp, OID_AUTO, max_transmit_data, CTLFLAG_RWTUN,
143 &tcp_max_transmit_data, 0,
144 "Maximum size of data payload in a transmitted PDU");
145
146 static MALLOC_DEFINE(M_NVMF_TCP, "nvmf_tcp", "NVMe over TCP");
147
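/*
 * CRC32C helpers used for the PDU header and data digests.
 * mbuf_crc32c() computes the digest over a range of an mbuf chain
 * via m_apply(), while compute_digest() handles a contiguous buffer.
 */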
148 static int
mbuf_crc32c_helper(void *arg, void *data, u_int len)
150 {
151 uint32_t *digestp = arg;
152
153 *digestp = calculate_crc32c(*digestp, data, len);
154 return (0);
155 }
156
157 static uint32_t
mbuf_crc32c(struct mbuf *m, u_int offset, u_int len)
159 {
160 uint32_t digest = 0xffffffff;
161
162 m_apply(m, offset, len, mbuf_crc32c_helper, &digest);
163 digest = digest ^ 0xffffffff;
164
165 return (digest);
166 }
167
168 static uint32_t
compute_digest(const void *buf, size_t len)
170 {
171 return (calculate_crc32c(0xffffffff, buf, len) ^ 0xffffffff);
172 }
173
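/*
 * Allocate a command buffer describing a host data buffer associated
 * with the command identified by 'cid'.  The caller holds the initial
 * reference on the buffer.
 */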
174 static struct nvmf_tcp_command_buffer *
tcp_alloc_command_buffer(struct nvmf_tcp_qpair *qp,
176 const struct nvmf_io_request *io, uint32_t data_offset, size_t data_len,
177 uint16_t cid)
178 {
179 struct nvmf_tcp_command_buffer *cb;
180
181 cb = malloc(sizeof(*cb), M_NVMF_TCP, M_WAITOK);
182 cb->qp = qp;
183 cb->io = *io;
184 cb->data_offset = data_offset;
185 cb->data_len = data_len;
186 cb->data_xfered = 0;
187 refcount_init(&cb->refs, 1);
188 cb->error = 0;
189 cb->cid = cid;
190 cb->ttag = 0;
191 cb->tc = NULL;
192
193 return (cb);
194 }
195
196 static void
tcp_hold_command_buffer(struct nvmf_tcp_command_buffer *cb)
198 {
199 refcount_acquire(&cb->refs);
200 }
201
202 static void
tcp_free_command_buffer(struct nvmf_tcp_command_buffer *cb)
204 {
205 nvmf_complete_io_request(&cb->io, cb->data_xfered, cb->error);
206 if (cb->tc != NULL)
207 tcp_release_capsule(cb->tc);
208 free(cb, M_NVMF_TCP);
209 }
210
211 static void
tcp_release_command_buffer(struct nvmf_tcp_command_buffer *cb)
213 {
214 if (refcount_release(&cb->refs))
215 tcp_free_command_buffer(cb);
216 }
217
218 static void
tcp_add_command_buffer(struct nvmf_tcp_command_buffer_list *list,
220 struct nvmf_tcp_command_buffer *cb)
221 {
222 mtx_assert(&list->lock, MA_OWNED);
223 TAILQ_INSERT_HEAD(&list->head, cb, link);
224 }
225
226 static struct nvmf_tcp_command_buffer *
tcp_find_command_buffer(struct nvmf_tcp_command_buffer_list *list,
228 uint16_t cid, uint16_t ttag)
229 {
230 struct nvmf_tcp_command_buffer *cb;
231
232 mtx_assert(&list->lock, MA_OWNED);
233 TAILQ_FOREACH(cb, &list->head, link) {
234 if (cb->cid == cid && cb->ttag == ttag)
235 return (cb);
236 }
237 return (NULL);
238 }
239
240 static void
tcp_remove_command_buffer(struct nvmf_tcp_command_buffer_list *list,
242 struct nvmf_tcp_command_buffer *cb)
243 {
244 mtx_assert(&list->lock, MA_OWNED);
245 TAILQ_REMOVE(&list->head, cb, link);
246 }
247
248 static void
tcp_purge_command_buffer(struct nvmf_tcp_command_buffer_list *list,
250 uint16_t cid, uint16_t ttag)
251 {
252 struct nvmf_tcp_command_buffer *cb;
253
254 mtx_lock(&list->lock);
255 cb = tcp_find_command_buffer(list, cid, ttag);
256 if (cb != NULL) {
257 tcp_remove_command_buffer(list, cb);
258 mtx_unlock(&list->lock);
259 tcp_release_command_buffer(cb);
260 } else
261 mtx_unlock(&list->lock);
262 }
263
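/*
 * Queue a fully-constructed PDU for transmission and wake the TX
 * thread if the socket can accept more data.
 */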
264 static void
nvmf_tcp_write_pdu(struct nvmf_tcp_qpair *qp, struct mbuf *m)
266 {
267 struct socket *so = qp->so;
268
269 SOCKBUF_LOCK(&so->so_snd);
270 mbufq_enqueue(&qp->tx_pdus, m);
271 /* XXX: Do we need to handle sb_hiwat being wrong? */
272 if (sowriteable(so))
273 cv_signal(&qp->tx_cv);
274 SOCKBUF_UNLOCK(&so->so_snd);
275 }
276
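/*
 * Send a terminate connection request PDU, echoing up to the first
 * 'hlen' bytes of the offending PDU as error data.
 */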
277 static void
nvmf_tcp_report_error(struct nvmf_tcp_qpair *qp, uint16_t fes, uint32_t fei,
279 struct mbuf *rx_pdu, u_int hlen)
280 {
281 struct nvme_tcp_term_req_hdr *hdr;
282 struct mbuf *m;
283
284 if (hlen != 0) {
285 hlen = min(hlen, NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE);
286 hlen = min(hlen, m_length(rx_pdu, NULL));
287 }
288
289 m = m_get2(sizeof(*hdr) + hlen, M_WAITOK, MT_DATA, 0);
290 m->m_len = sizeof(*hdr) + hlen;
291 hdr = mtod(m, void *);
292 memset(hdr, 0, sizeof(*hdr));
293 hdr->common.pdu_type = qp->qp.nq_controller ?
294 NVME_TCP_PDU_TYPE_C2H_TERM_REQ : NVME_TCP_PDU_TYPE_H2C_TERM_REQ;
295 hdr->common.hlen = sizeof(*hdr);
296 hdr->common.plen = sizeof(*hdr) + hlen;
297 hdr->fes = htole16(fes);
298 le32enc(hdr->fei, fei);
299 if (hlen != 0)
300 m_copydata(rx_pdu, 0, hlen, (caddr_t)(hdr + 1));
301
302 nvmf_tcp_write_pdu(qp, m);
303 }
304
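/*
 * Validate a received PDU's common header and verify its digests.  A
 * header digest mismatch terminates the connection, while a data
 * digest mismatch is only recorded so that the associated I/O request
 * can be failed later.
 */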
305 static int
nvmf_tcp_validate_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
307 {
308 const struct nvme_tcp_common_pdu_hdr *ch;
309 struct mbuf *m = pdu->m;
310 uint32_t data_len, fei, plen;
311 uint32_t digest, rx_digest;
312 u_int hlen;
313 int error;
314 uint16_t fes;
315
316 /* Determine how large of a PDU header to return for errors. */
317 ch = pdu->hdr;
318 hlen = ch->hlen;
319 plen = le32toh(ch->plen);
320 if (hlen < sizeof(*ch) || hlen > plen)
321 hlen = sizeof(*ch);
322
323 error = nvmf_tcp_validate_pdu_header(ch, qp->qp.nq_controller,
324 qp->header_digests, qp->data_digests, qp->rxpda, &data_len, &fes,
325 &fei);
326 if (error != 0) {
327 if (error != ECONNRESET)
328 nvmf_tcp_report_error(qp, fes, fei, m, hlen);
329 return (error);
330 }
331
332 /* Check header digest if present. */
333 if ((ch->flags & NVME_TCP_CH_FLAGS_HDGSTF) != 0) {
334 digest = mbuf_crc32c(m, 0, ch->hlen);
335 m_copydata(m, ch->hlen, sizeof(rx_digest), (caddr_t)&rx_digest);
336 if (digest != rx_digest) {
337 printf("NVMe/TCP: Header digest mismatch\n");
338 nvmf_tcp_report_error(qp,
339 NVME_TCP_TERM_REQ_FES_HDGST_ERROR, rx_digest, m,
340 hlen);
341 return (EBADMSG);
342 }
343 }
344
345 /* Check data digest if present. */
346 pdu->data_digest_mismatch = false;
347 if ((ch->flags & NVME_TCP_CH_FLAGS_DDGSTF) != 0) {
348 digest = mbuf_crc32c(m, ch->pdo, data_len);
349 m_copydata(m, plen - sizeof(rx_digest), sizeof(rx_digest),
350 (caddr_t)&rx_digest);
351 if (digest != rx_digest) {
352 printf("NVMe/TCP: Data digest mismatch\n");
353 pdu->data_digest_mismatch = true;
354 }
355 }
356
357 pdu->data_len = data_len;
358 return (0);
359 }
360
361 static void
nvmf_tcp_free_pdu(struct nvmf_tcp_rxpdu *pdu)
363 {
364 m_freem(pdu->m);
365 pdu->m = NULL;
366 pdu->hdr = NULL;
367 }
368
369 static int
nvmf_tcp_handle_term_req(struct nvmf_tcp_rxpdu *pdu)
371 {
372 const struct nvme_tcp_term_req_hdr *hdr;
373
374 hdr = (const void *)pdu->hdr;
375
376 printf("NVMe/TCP: Received termination request: fes %#x fei %#x\n",
377 le16toh(hdr->fes), le32dec(hdr->fei));
378 nvmf_tcp_free_pdu(pdu);
379 return (ECONNRESET);
380 }
381
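/*
 * Wrap a received CAPSULE_CMD PDU in a command capsule and pass it up
 * to the transport-independent layer.  The PDU is retained so that
 * any in-capsule data can be copied out later.
 */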
382 static int
nvmf_tcp_save_command_capsule(struct nvmf_tcp_qpair *qp,
384 struct nvmf_tcp_rxpdu *pdu)
385 {
386 const struct nvme_tcp_cmd *cmd;
387 struct nvmf_capsule *nc;
388 struct nvmf_tcp_capsule *tc;
389
390 cmd = (const void *)pdu->hdr;
391
392 nc = nvmf_allocate_command(&qp->qp, &cmd->ccsqe, M_WAITOK);
393
394 tc = TCAP(nc);
395 tc->rx_pdu = *pdu;
396
397 nvmf_capsule_received(&qp->qp, nc);
398 return (0);
399 }
400
401 static int
nvmf_tcp_save_response_capsule(struct nvmf_tcp_qpair *qp,
403 struct nvmf_tcp_rxpdu *pdu)
404 {
405 const struct nvme_tcp_rsp *rsp;
406 struct nvmf_capsule *nc;
407 struct nvmf_tcp_capsule *tc;
408
409 rsp = (const void *)pdu->hdr;
410
411 nc = nvmf_allocate_response(&qp->qp, &rsp->rccqe, M_WAITOK);
412
413 nc->nc_sqhd_valid = true;
414 tc = TCAP(nc);
415 tc->rx_pdu = *pdu;
416
417 /*
418 * Once the CQE has been received, no further transfers to the
419 * command buffer for the associated CID can occur.
420 */
421 tcp_purge_command_buffer(&qp->rx_buffers, rsp->rccqe.cid, 0);
422 tcp_purge_command_buffer(&qp->tx_buffers, rsp->rccqe.cid, 0);
423
424 nvmf_capsule_received(&qp->qp, nc);
425 return (0);
426 }
427
428 /*
429 * Construct a PDU that contains an optional data payload. This
430 * includes dealing with digests and the length fields in the common
431 * header.
432 */
433 static struct mbuf *
nvmf_tcp_construct_pdu(struct nvmf_tcp_qpair *qp, void *hdr, size_t hlen,
435 struct mbuf *data, uint32_t data_len)
436 {
437 struct nvme_tcp_common_pdu_hdr *ch;
438 struct mbuf *top;
439 uint32_t digest, pad, pdo, plen, mlen;
440
441 plen = hlen;
442 if (qp->header_digests)
443 plen += sizeof(digest);
444 if (data_len != 0) {
445 KASSERT(m_length(data, NULL) == data_len, ("length mismatch"));
446 pdo = roundup(plen, qp->txpda);
447 pad = pdo - plen;
448 plen = pdo + data_len;
449 if (qp->data_digests)
450 plen += sizeof(digest);
451 mlen = pdo;
452 } else {
453 KASSERT(data == NULL, ("payload mbuf with zero length"));
454 pdo = 0;
455 pad = 0;
456 mlen = plen;
457 }
458
459 top = m_get2(mlen, M_WAITOK, MT_DATA, 0);
460 top->m_len = mlen;
461 ch = mtod(top, void *);
462 memcpy(ch, hdr, hlen);
463 ch->hlen = hlen;
464 if (qp->header_digests)
465 ch->flags |= NVME_TCP_CH_FLAGS_HDGSTF;
466 if (qp->data_digests && data_len != 0)
467 ch->flags |= NVME_TCP_CH_FLAGS_DDGSTF;
468 ch->pdo = pdo;
469 ch->plen = htole32(plen);
470
471 /* HDGST */
472 if (qp->header_digests) {
473 digest = compute_digest(ch, hlen);
474 memcpy((char *)ch + hlen, &digest, sizeof(digest));
475 }
476
477 if (pad != 0) {
478 /* PAD */
479 memset((char *)ch + pdo - pad, 0, pad);
480 }
481
482 if (data_len != 0) {
483 /* DATA */
484 top->m_next = data;
485
486 /* DDGST */
487 if (qp->data_digests) {
488 digest = mbuf_crc32c(data, 0, data_len);
489
490 /* XXX: Can't use m_append as it uses M_NOWAIT. */
491 while (data->m_next != NULL)
492 data = data->m_next;
493
494 data->m_next = m_get(M_WAITOK, MT_DATA);
495 data->m_next->m_len = sizeof(digest);
496 memcpy(mtod(data->m_next, void *), &digest,
497 sizeof(digest));
498 }
499 }
500
501 return (top);
502 }
503
504 /* Find the next command buffer eligible to schedule for R2T. */
505 static struct nvmf_tcp_command_buffer *
nvmf_tcp_next_r2t(struct nvmf_tcp_qpair *qp)
507 {
508 struct nvmf_tcp_command_buffer *cb;
509
510 mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
511 MPASS(qp->active_ttags < qp->num_ttags);
512
513 TAILQ_FOREACH(cb, &qp->rx_buffers.head, link) {
514 /* NB: maxr2t is 0's based. */
515 if (cb->tc->active_r2ts > qp->maxr2t)
516 continue;
517 #ifdef INVARIANTS
518 cb->tc->pending_r2ts--;
519 #endif
520 TAILQ_REMOVE(&qp->rx_buffers.head, cb, link);
521 return (cb);
522 }
523 return (NULL);
524 }
525
526 /* Allocate the next free transfer tag and assign it to cb. */
527 static void
nvmf_tcp_allocate_ttag(struct nvmf_tcp_qpair *qp,
529 struct nvmf_tcp_command_buffer *cb)
530 {
531 uint16_t ttag;
532
533 mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
534
535 ttag = qp->next_ttag;
536 for (;;) {
537 if (qp->open_ttags[ttag] == NULL)
538 break;
539 if (ttag == qp->num_ttags - 1)
540 ttag = 0;
541 else
542 ttag++;
543 MPASS(ttag != qp->next_ttag);
544 }
545 if (ttag == qp->num_ttags - 1)
546 qp->next_ttag = 0;
547 else
548 qp->next_ttag = ttag + 1;
549
550 cb->tc->active_r2ts++;
551 qp->active_ttags++;
552 qp->open_ttags[ttag] = cb;
553
554 /*
555 * Don't bother byte-swapping ttag as it is just a cookie
556 * value returned by the other end as-is.
557 */
558 cb->ttag = ttag;
559 }
560
561 /* NB: cid and ttag are both little-endian already. */
562 static void
tcp_send_r2t(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
564 uint32_t data_offset, uint32_t data_len)
565 {
566 struct nvme_tcp_r2t_hdr r2t;
567 struct mbuf *m;
568
569 memset(&r2t, 0, sizeof(r2t));
570 r2t.common.pdu_type = NVME_TCP_PDU_TYPE_R2T;
571 r2t.cccid = cid;
572 r2t.ttag = ttag;
573 r2t.r2to = htole32(data_offset);
574 r2t.r2tl = htole32(data_len);
575
576 m = nvmf_tcp_construct_pdu(qp, &r2t, sizeof(r2t), NULL, 0);
577 nvmf_tcp_write_pdu(qp, m);
578 }
579
580 /*
581 * Release a transfer tag and schedule another R2T.
582 *
583 * NB: This drops the rx_buffers.lock mutex.
584 */
585 static void
nvmf_tcp_send_next_r2t(struct nvmf_tcp_qpair *qp,
587 struct nvmf_tcp_command_buffer *cb)
588 {
589 struct nvmf_tcp_command_buffer *ncb;
590
591 mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
592 MPASS(qp->open_ttags[cb->ttag] == cb);
593
594 /* Release this transfer tag. */
595 qp->open_ttags[cb->ttag] = NULL;
596 qp->active_ttags--;
597 cb->tc->active_r2ts--;
598
599 /* Schedule another R2T. */
600 ncb = nvmf_tcp_next_r2t(qp);
601 if (ncb != NULL) {
602 nvmf_tcp_allocate_ttag(qp, ncb);
603 mtx_unlock(&qp->rx_buffers.lock);
604 tcp_send_r2t(qp, ncb->cid, ncb->ttag, ncb->data_offset,
605 ncb->data_len);
606 } else
607 mtx_unlock(&qp->rx_buffers.lock);
608 }
609
610 /*
611 * Copy len bytes starting at offset skip from an mbuf chain into an
612 * I/O buffer at destination offset io_offset.
613 */
614 static void
mbuf_copyto_io(struct mbuf *m, u_int skip, u_int len,
616 struct nvmf_io_request *io, u_int io_offset)
617 {
618 u_int todo;
619
620 while (m->m_len <= skip) {
621 skip -= m->m_len;
622 m = m->m_next;
623 }
624 while (len != 0) {
625 MPASS((m->m_flags & M_EXTPG) == 0);
626
627 todo = min(m->m_len - skip, len);
628 memdesc_copyback(&io->io_mem, io_offset, todo, mtodo(m, skip));
629 skip = 0;
630 io_offset += todo;
631 len -= todo;
632 m = m->m_next;
633 }
634 }
635
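/*
 * Controller only: copy the payload of an H2C_DATA PDU into the
 * command buffer associated with its transfer tag, validating the
 * length, offset, and PDU sequencing.  The transfer tag is released
 * (and another R2T scheduled) once the full transfer has arrived.
 */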
636 static int
nvmf_tcp_handle_h2c_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
638 {
639 const struct nvme_tcp_h2c_data_hdr *h2c;
640 struct nvmf_tcp_command_buffer *cb;
641 uint32_t data_len, data_offset;
642 uint16_t ttag;
643
644 h2c = (const void *)pdu->hdr;
645 if (le32toh(h2c->datal) > qp->maxh2cdata) {
646 nvmf_tcp_report_error(qp,
647 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_LIMIT_EXCEEDED, 0,
648 pdu->m, pdu->hdr->hlen);
649 nvmf_tcp_free_pdu(pdu);
650 return (EBADMSG);
651 }
652
653 /*
654 * NB: Don't bother byte-swapping ttag as we don't byte-swap
655 * it when sending.
656 */
657 ttag = h2c->ttag;
658 if (ttag >= qp->num_ttags) {
659 nvmf_tcp_report_error(qp,
660 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
661 offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->m,
662 pdu->hdr->hlen);
663 nvmf_tcp_free_pdu(pdu);
664 return (EBADMSG);
665 }
666
667 mtx_lock(&qp->rx_buffers.lock);
668 cb = qp->open_ttags[ttag];
669 if (cb == NULL) {
670 mtx_unlock(&qp->rx_buffers.lock);
671 nvmf_tcp_report_error(qp,
672 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
673 offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->m,
674 pdu->hdr->hlen);
675 nvmf_tcp_free_pdu(pdu);
676 return (EBADMSG);
677 }
678 MPASS(cb->ttag == ttag);
679
680 /* For a data digest mismatch, fail the I/O request. */
681 if (pdu->data_digest_mismatch) {
682 nvmf_tcp_send_next_r2t(qp, cb);
683 cb->error = EINTEGRITY;
684 tcp_release_command_buffer(cb);
685 nvmf_tcp_free_pdu(pdu);
686 return (0);
687 }
688
689 data_len = le32toh(h2c->datal);
690 if (data_len != pdu->data_len) {
691 mtx_unlock(&qp->rx_buffers.lock);
692 nvmf_tcp_report_error(qp,
693 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
694 offsetof(struct nvme_tcp_h2c_data_hdr, datal), pdu->m,
695 pdu->hdr->hlen);
696 nvmf_tcp_free_pdu(pdu);
697 return (EBADMSG);
698 }
699
700 data_offset = le32toh(h2c->datao);
701 if (data_offset < cb->data_offset ||
702 data_offset + data_len > cb->data_offset + cb->data_len) {
703 mtx_unlock(&qp->rx_buffers.lock);
704 nvmf_tcp_report_error(qp,
705 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, pdu->m,
706 pdu->hdr->hlen);
707 nvmf_tcp_free_pdu(pdu);
708 return (EBADMSG);
709 }
710
711 if (data_offset != cb->data_offset + cb->data_xfered) {
712 mtx_unlock(&qp->rx_buffers.lock);
713 nvmf_tcp_report_error(qp,
714 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
715 pdu->hdr->hlen);
716 nvmf_tcp_free_pdu(pdu);
717 return (EBADMSG);
718 }
719
720 if ((cb->data_xfered + data_len == cb->data_len) !=
721 ((pdu->hdr->flags & NVME_TCP_H2C_DATA_FLAGS_LAST_PDU) != 0)) {
722 mtx_unlock(&qp->rx_buffers.lock);
723 nvmf_tcp_report_error(qp,
724 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
725 pdu->hdr->hlen);
726 nvmf_tcp_free_pdu(pdu);
727 return (EBADMSG);
728 }
729
730 cb->data_xfered += data_len;
731 data_offset -= cb->data_offset;
732 if (cb->data_xfered == cb->data_len) {
733 nvmf_tcp_send_next_r2t(qp, cb);
734 } else {
735 tcp_hold_command_buffer(cb);
736 mtx_unlock(&qp->rx_buffers.lock);
737 }
738
739 mbuf_copyto_io(pdu->m, pdu->hdr->pdo, data_len, &cb->io, data_offset);
740
741 tcp_release_command_buffer(cb);
742 nvmf_tcp_free_pdu(pdu);
743 return (0);
744 }
745
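/*
 * Host only: copy the payload of a C2H_DATA PDU into the receive
 * buffer registered for the command's CID.  If the controller set the
 * SUCCESS flag, synthesize a completion since no response capsule
 * will follow.
 */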
746 static int
nvmf_tcp_handle_c2h_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
748 {
749 const struct nvme_tcp_c2h_data_hdr *c2h;
750 struct nvmf_tcp_command_buffer *cb;
751 uint32_t data_len, data_offset;
752
753 c2h = (const void *)pdu->hdr;
754
755 mtx_lock(&qp->rx_buffers.lock);
756 cb = tcp_find_command_buffer(&qp->rx_buffers, c2h->cccid, 0);
757 if (cb == NULL) {
758 mtx_unlock(&qp->rx_buffers.lock);
759 /*
760 * XXX: Could be PDU sequence error if cccid is for a
761 * command that doesn't use a command buffer.
762 */
763 nvmf_tcp_report_error(qp,
764 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
765 offsetof(struct nvme_tcp_c2h_data_hdr, cccid), pdu->m,
766 pdu->hdr->hlen);
767 nvmf_tcp_free_pdu(pdu);
768 return (EBADMSG);
769 }
770
771 /* For a data digest mismatch, fail the I/O request. */
772 if (pdu->data_digest_mismatch) {
773 cb->error = EINTEGRITY;
774 tcp_remove_command_buffer(&qp->rx_buffers, cb);
775 mtx_unlock(&qp->rx_buffers.lock);
776 tcp_release_command_buffer(cb);
777 nvmf_tcp_free_pdu(pdu);
778 return (0);
779 }
780
781 data_len = le32toh(c2h->datal);
782 if (data_len != pdu->data_len) {
783 mtx_unlock(&qp->rx_buffers.lock);
784 nvmf_tcp_report_error(qp,
785 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
786 offsetof(struct nvme_tcp_c2h_data_hdr, datal), pdu->m,
787 pdu->hdr->hlen);
788 nvmf_tcp_free_pdu(pdu);
789 return (EBADMSG);
790 }
791
792 data_offset = le32toh(c2h->datao);
793 if (data_offset < cb->data_offset ||
794 data_offset + data_len > cb->data_offset + cb->data_len) {
795 mtx_unlock(&qp->rx_buffers.lock);
796 nvmf_tcp_report_error(qp,
797 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
798 pdu->m, pdu->hdr->hlen);
799 nvmf_tcp_free_pdu(pdu);
800 return (EBADMSG);
801 }
802
803 if (data_offset != cb->data_offset + cb->data_xfered) {
804 mtx_unlock(&qp->rx_buffers.lock);
805 nvmf_tcp_report_error(qp,
806 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
807 pdu->hdr->hlen);
808 nvmf_tcp_free_pdu(pdu);
809 return (EBADMSG);
810 }
811
812 if ((cb->data_xfered + data_len == cb->data_len) !=
813 ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_LAST_PDU) != 0)) {
814 mtx_unlock(&qp->rx_buffers.lock);
815 nvmf_tcp_report_error(qp,
816 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
817 pdu->hdr->hlen);
818 nvmf_tcp_free_pdu(pdu);
819 return (EBADMSG);
820 }
821
822 cb->data_xfered += data_len;
823 data_offset -= cb->data_offset;
824 if (cb->data_xfered == cb->data_len)
825 tcp_remove_command_buffer(&qp->rx_buffers, cb);
826 else
827 tcp_hold_command_buffer(cb);
828 mtx_unlock(&qp->rx_buffers.lock);
829
830 mbuf_copyto_io(pdu->m, pdu->hdr->pdo, data_len, &cb->io, data_offset);
831
832 tcp_release_command_buffer(cb);
833
834 if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) {
835 struct nvme_completion cqe;
836 struct nvmf_capsule *nc;
837
838 memset(&cqe, 0, sizeof(cqe));
839 cqe.cid = c2h->cccid;
840
841 nc = nvmf_allocate_response(&qp->qp, &cqe, M_WAITOK);
842 nc->nc_sqhd_valid = false;
843
844 nvmf_capsule_received(&qp->qp, nc);
845 }
846
847 nvmf_tcp_free_pdu(pdu);
848 return (0);
849 }
850
851 /* Called when m_free drops refcount to 0. */
852 static void
nvmf_tcp_mbuf_done(struct mbuf *m)
854 {
855 struct nvmf_tcp_command_buffer *cb = m->m_ext.ext_arg1;
856
857 tcp_free_command_buffer(cb);
858 }
859
860 static struct mbuf *
nvmf_tcp_mbuf(void *arg, int how, void *data, size_t len)
862 {
863 struct nvmf_tcp_command_buffer *cb = arg;
864 struct mbuf *m;
865
866 m = m_get(how, MT_DATA);
867 m->m_flags |= M_RDONLY;
868 m_extaddref(m, data, len, &cb->refs, nvmf_tcp_mbuf_done, cb, NULL);
869 m->m_len = len;
870 return (m);
871 }
872
873 static void
nvmf_tcp_free_mext_pg(struct mbuf *m)
875 {
876 struct nvmf_tcp_command_buffer *cb = m->m_ext.ext_arg1;
877
878 M_ASSERTEXTPG(m);
879 tcp_release_command_buffer(cb);
880 }
881
882 static struct mbuf *
nvmf_tcp_mext_pg(void *arg, int how)
884 {
885 struct nvmf_tcp_command_buffer *cb = arg;
886 struct mbuf *m;
887
888 m = mb_alloc_ext_pgs(how, nvmf_tcp_free_mext_pg, M_RDONLY);
889 m->m_ext.ext_arg1 = cb;
890 tcp_hold_command_buffer(cb);
891 return (m);
892 }
893
/*
 * Return an mbuf chain for a range of data belonging to a command
 * buffer.
 *
 * The mbuf chain uses M_EXT mbufs which hold references on the
 * command buffer so that it remains "alive" until the data has been
 * fully transmitted.  If can_truncate is true, the returned chain
 * may be shorter than data_len to avoid gratuitously splitting up a
 * page.
 */
904 static struct mbuf *
nvmf_tcp_command_buffer_mbuf(struct nvmf_tcp_command_buffer *cb,
906 uint32_t data_offset, uint32_t data_len, uint32_t *actual_len,
907 bool can_truncate)
908 {
909 struct mbuf *m;
910 size_t len;
911
912 m = memdesc_alloc_ext_mbufs(&cb->io.io_mem, nvmf_tcp_mbuf,
913 nvmf_tcp_mext_pg, cb, M_WAITOK, data_offset, data_len, &len,
914 can_truncate);
915 if (actual_len != NULL)
916 *actual_len = len;
917 return (m);
918 }
919
/* NB: cid and ttag are little-endian already. */
921 static void
tcp_send_h2c_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
923 uint32_t data_offset, struct mbuf *m, size_t len, bool last_pdu)
924 {
925 struct nvme_tcp_h2c_data_hdr h2c;
926 struct mbuf *top;
927
928 memset(&h2c, 0, sizeof(h2c));
929 h2c.common.pdu_type = NVME_TCP_PDU_TYPE_H2C_DATA;
930 if (last_pdu)
931 h2c.common.flags |= NVME_TCP_H2C_DATA_FLAGS_LAST_PDU;
932 h2c.cccid = cid;
933 h2c.ttag = ttag;
934 h2c.datao = htole32(data_offset);
935 h2c.datal = htole32(len);
936
937 top = nvmf_tcp_construct_pdu(qp, &h2c, sizeof(h2c), m, len);
938 nvmf_tcp_write_pdu(qp, top);
939 }
940
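/*
 * Host only: satisfy an R2T by sending the requested range of the
 * command's data as one or more H2C_DATA PDUs.
 */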
941 static int
nvmf_tcp_handle_r2t(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
943 {
944 const struct nvme_tcp_r2t_hdr *r2t;
945 struct nvmf_tcp_command_buffer *cb;
946 uint32_t data_len, data_offset;
947
948 r2t = (const void *)pdu->hdr;
949
950 mtx_lock(&qp->tx_buffers.lock);
951 cb = tcp_find_command_buffer(&qp->tx_buffers, r2t->cccid, 0);
952 if (cb == NULL) {
953 mtx_unlock(&qp->tx_buffers.lock);
954 nvmf_tcp_report_error(qp,
955 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
956 offsetof(struct nvme_tcp_r2t_hdr, cccid), pdu->m,
957 pdu->hdr->hlen);
958 nvmf_tcp_free_pdu(pdu);
959 return (EBADMSG);
960 }
961
962 data_offset = le32toh(r2t->r2to);
963 if (data_offset != cb->data_xfered) {
964 mtx_unlock(&qp->tx_buffers.lock);
965 nvmf_tcp_report_error(qp,
966 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
967 pdu->hdr->hlen);
968 nvmf_tcp_free_pdu(pdu);
969 return (EBADMSG);
970 }
971
972 /*
 * XXX: The spec does not specify how to handle R2T transfers
974 * out of range of the original command.
975 */
976 data_len = le32toh(r2t->r2tl);
977 if (data_offset + data_len > cb->data_len) {
978 mtx_unlock(&qp->tx_buffers.lock);
979 nvmf_tcp_report_error(qp,
980 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
981 pdu->m, pdu->hdr->hlen);
982 nvmf_tcp_free_pdu(pdu);
983 return (EBADMSG);
984 }
985
986 cb->data_xfered += data_len;
987 if (cb->data_xfered == cb->data_len)
988 tcp_remove_command_buffer(&qp->tx_buffers, cb);
989 else
990 tcp_hold_command_buffer(cb);
991 mtx_unlock(&qp->tx_buffers.lock);
992
993 /*
994 * Queue one or more H2C_DATA PDUs containing the requested
995 * data.
996 */
997 while (data_len > 0) {
998 struct mbuf *m;
999 uint32_t sent, todo;
1000
1001 todo = min(data_len, qp->max_tx_data);
1002 m = nvmf_tcp_command_buffer_mbuf(cb, data_offset, todo, &sent,
1003 todo < data_len);
1004 tcp_send_h2c_pdu(qp, r2t->cccid, r2t->ttag, data_offset, m,
1005 sent, sent == data_len);
1006
1007 data_offset += sent;
1008 data_len -= sent;
1009 }
1010
1011 tcp_release_command_buffer(cb);
1012 nvmf_tcp_free_pdu(pdu);
1013 return (0);
1014 }
1015
1016 /*
1017 * A variant of m_pullup that uses M_WAITOK instead of failing. It
1018 * also doesn't do anything if enough bytes are already present in the
1019 * first mbuf.
1020 */
1021 static struct mbuf *
pullup_pdu_hdr(struct mbuf *m, int len)
1023 {
1024 struct mbuf *n, *p;
1025
1026 KASSERT(len <= MCLBYTES, ("%s: len too large", __func__));
1027 if (m->m_len >= len)
1028 return (m);
1029
1030 n = m_get2(len, M_WAITOK, MT_DATA, 0);
1031 n->m_len = len;
1032 m_copydata(m, 0, len, mtod(n, void *));
1033
1034 while (m != NULL && m->m_len <= len) {
1035 p = m->m_next;
1036 len -= m->m_len;
1037 m_free(m);
1038 m = p;
1039 }
1040 if (len > 0) {
1041 m->m_data += len;
1042 m->m_len -= len;
1043 }
1044 n->m_next = m;
1045 return (n);
1046 }
1047
1048 static int
nvmf_tcp_dispatch_pdu(struct nvmf_tcp_qpair *qp,
1050 const struct nvme_tcp_common_pdu_hdr *ch, struct nvmf_tcp_rxpdu *pdu)
1051 {
1052 /* Ensure the PDU header is contiguous. */
1053 pdu->m = pullup_pdu_hdr(pdu->m, ch->hlen);
1054 pdu->hdr = mtod(pdu->m, const void *);
1055
1056 switch (ch->pdu_type) {
1057 default:
1058 __assert_unreachable();
1059 break;
1060 case NVME_TCP_PDU_TYPE_H2C_TERM_REQ:
1061 case NVME_TCP_PDU_TYPE_C2H_TERM_REQ:
1062 return (nvmf_tcp_handle_term_req(pdu));
1063 case NVME_TCP_PDU_TYPE_CAPSULE_CMD:
1064 return (nvmf_tcp_save_command_capsule(qp, pdu));
1065 case NVME_TCP_PDU_TYPE_CAPSULE_RESP:
1066 return (nvmf_tcp_save_response_capsule(qp, pdu));
1067 case NVME_TCP_PDU_TYPE_H2C_DATA:
1068 return (nvmf_tcp_handle_h2c_data(qp, pdu));
1069 case NVME_TCP_PDU_TYPE_C2H_DATA:
1070 return (nvmf_tcp_handle_c2h_data(qp, pdu));
1071 case NVME_TCP_PDU_TYPE_R2T:
1072 return (nvmf_tcp_handle_r2t(qp, pdu));
1073 }
1074 }
1075
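/*
 * Receive kthread: peek at the common header of the next PDU to learn
 * its length, read the full PDU into an mbuf chain, then validate and
 * dispatch it.  Runs until the queue pair is shut down or a fatal
 * error occurs.
 */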
1076 static void
nvmf_tcp_receive(void *arg)
1078 {
1079 struct nvmf_tcp_qpair *qp = arg;
1080 struct socket *so = qp->so;
1081 struct nvmf_tcp_rxpdu pdu;
1082 struct nvme_tcp_common_pdu_hdr ch;
1083 struct uio uio;
1084 struct iovec iov[1];
1085 struct mbuf *m, *n, *tail;
1086 u_int avail, needed;
1087 int error, flags, terror;
1088 bool have_header;
1089
1090 m = tail = NULL;
1091 have_header = false;
1092 SOCKBUF_LOCK(&so->so_rcv);
1093 while (!qp->rx_shutdown) {
1094 /* Wait until there is enough data for the next step. */
1095 if (so->so_error != 0 || so->so_rerror != 0) {
1096 if (so->so_error != 0)
1097 error = so->so_error;
1098 else
1099 error = so->so_rerror;
1100 SOCKBUF_UNLOCK(&so->so_rcv);
1101 error:
1102 m_freem(m);
1103 nvmf_qpair_error(&qp->qp, error);
1104 SOCKBUF_LOCK(&so->so_rcv);
1105 while (!qp->rx_shutdown)
1106 cv_wait(&qp->rx_cv, SOCKBUF_MTX(&so->so_rcv));
1107 break;
1108 }
1109 avail = sbavail(&so->so_rcv);
1110 if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) != 0) {
1111 if (!have_header && avail == 0)
1112 error = 0;
1113 else
1114 error = ECONNRESET;
1115 SOCKBUF_UNLOCK(&so->so_rcv);
1116 goto error;
1117 }
1118 if (avail == 0 || (!have_header && avail < sizeof(ch))) {
1119 cv_wait(&qp->rx_cv, SOCKBUF_MTX(&so->so_rcv));
1120 continue;
1121 }
1122 SOCKBUF_UNLOCK(&so->so_rcv);
1123
1124 if (!have_header) {
1125 KASSERT(m == NULL, ("%s: m != NULL but no header",
1126 __func__));
1127 memset(&uio, 0, sizeof(uio));
1128 iov[0].iov_base = &ch;
1129 iov[0].iov_len = sizeof(ch);
1130 uio.uio_iov = iov;
1131 uio.uio_iovcnt = 1;
1132 uio.uio_resid = sizeof(ch);
1133 uio.uio_segflg = UIO_SYSSPACE;
1134 uio.uio_rw = UIO_READ;
1135 flags = MSG_DONTWAIT | MSG_PEEK;
1136
1137 error = soreceive(so, NULL, &uio, NULL, NULL, &flags);
1138 if (error != 0)
1139 goto error;
1140 KASSERT(uio.uio_resid == 0, ("%s: short CH read",
1141 __func__));
1142
1143 have_header = true;
1144 needed = le32toh(ch.plen);
1145
1146 /*
1147 * Malformed PDUs will be reported as errors
1148 * by nvmf_tcp_validate_pdu. Just pass along
1149 * garbage headers if the lengths mismatch.
1150 */
1151 if (needed < sizeof(ch) || ch.hlen > needed)
1152 needed = sizeof(ch);
1153
1154 memset(&uio, 0, sizeof(uio));
1155 uio.uio_resid = needed;
1156 }
1157
1158 flags = MSG_DONTWAIT;
1159 error = soreceive(so, NULL, &uio, &n, NULL, &flags);
1160 if (error != 0)
1161 goto error;
1162
1163 if (m == NULL)
1164 m = n;
1165 else
1166 tail->m_next = n;
1167
1168 if (uio.uio_resid != 0) {
1169 tail = n;
1170 while (tail->m_next != NULL)
1171 tail = tail->m_next;
1172
1173 SOCKBUF_LOCK(&so->so_rcv);
1174 continue;
1175 }
1176 #ifdef INVARIANTS
1177 tail = NULL;
1178 #endif
1179
1180 pdu.m = m;
1181 m = NULL;
1182 pdu.hdr = &ch;
1183 error = nvmf_tcp_validate_pdu(qp, &pdu);
1184 if (error != 0)
1185 m_freem(pdu.m);
1186 else
1187 error = nvmf_tcp_dispatch_pdu(qp, &ch, &pdu);
1188 if (error != 0) {
1189 /*
1190 * If we received a termination request, close
1191 * the connection immediately.
1192 */
1193 if (error == ECONNRESET)
1194 goto error;
1195
1196 /*
1197 * Wait for up to 30 seconds for the socket to
1198 * be closed by the other end.
1199 */
1200 SOCKBUF_LOCK(&so->so_rcv);
1201 if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1202 terror = cv_timedwait(&qp->rx_cv,
1203 SOCKBUF_MTX(&so->so_rcv), 30 * hz);
1204 if (terror == ETIMEDOUT)
1205 printf("NVMe/TCP: Timed out after sending terminate request\n");
1206 }
1207 SOCKBUF_UNLOCK(&so->so_rcv);
1208 goto error;
1209 }
1210
1211 have_header = false;
1212 SOCKBUF_LOCK(&so->so_rcv);
1213 }
1214 SOCKBUF_UNLOCK(&so->so_rcv);
1215 kthread_exit();
1216 }
1217
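/*
 * Build a CAPSULE_CMD PDU for a command capsule.  Data that fits
 * within the controller's ICD limit is sent in-capsule; otherwise a
 * command buffer is registered for later R2T or C2H_DATA transfers.
 */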
1218 static struct mbuf *
tcp_command_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_capsule *tc)
1220 {
1221 struct nvmf_capsule *nc = &tc->nc;
1222 struct nvmf_tcp_command_buffer *cb;
1223 struct nvme_sgl_descriptor *sgl;
1224 struct nvme_tcp_cmd cmd;
1225 struct mbuf *top, *m;
1226 bool use_icd;
1227
1228 use_icd = false;
1229 cb = NULL;
1230 m = NULL;
1231
1232 if (nc->nc_data.io_len != 0) {
1233 cb = tcp_alloc_command_buffer(qp, &nc->nc_data, 0,
1234 nc->nc_data.io_len, nc->nc_sqe.cid);
1235
1236 if (nc->nc_send_data && nc->nc_data.io_len <= qp->max_icd) {
1237 use_icd = true;
1238 m = nvmf_tcp_command_buffer_mbuf(cb, 0,
1239 nc->nc_data.io_len, NULL, false);
1240 cb->data_xfered = nc->nc_data.io_len;
1241 tcp_release_command_buffer(cb);
1242 } else if (nc->nc_send_data) {
1243 mtx_lock(&qp->tx_buffers.lock);
1244 tcp_add_command_buffer(&qp->tx_buffers, cb);
1245 mtx_unlock(&qp->tx_buffers.lock);
1246 } else {
1247 mtx_lock(&qp->rx_buffers.lock);
1248 tcp_add_command_buffer(&qp->rx_buffers, cb);
1249 mtx_unlock(&qp->rx_buffers.lock);
1250 }
1251 }
1252
1253 memset(&cmd, 0, sizeof(cmd));
1254 cmd.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_CMD;
1255 cmd.ccsqe = nc->nc_sqe;
1256
1257 /* Populate SGL in SQE. */
1258 sgl = &cmd.ccsqe.sgl;
1259 memset(sgl, 0, sizeof(*sgl));
1260 sgl->address = 0;
1261 sgl->length = htole32(nc->nc_data.io_len);
1262 if (use_icd) {
1263 /* Use in-capsule data. */
1264 sgl->type = NVME_SGL_TYPE_ICD;
1265 } else {
1266 /* Use a command buffer. */
1267 sgl->type = NVME_SGL_TYPE_COMMAND_BUFFER;
1268 }
1269
1270 top = nvmf_tcp_construct_pdu(qp, &cmd, sizeof(cmd), m, m != NULL ?
1271 nc->nc_data.io_len : 0);
1272 return (top);
1273 }
1274
1275 static struct mbuf *
tcp_response_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_capsule *tc)
1277 {
1278 struct nvmf_capsule *nc = &tc->nc;
1279 struct nvme_tcp_rsp rsp;
1280
1281 memset(&rsp, 0, sizeof(rsp));
1282 rsp.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_RESP;
1283 rsp.rccqe = nc->nc_cqe;
1284
1285 return (nvmf_tcp_construct_pdu(qp, &rsp, sizeof(rsp), NULL, 0));
1286 }
1287
1288 static struct mbuf *
capsule_to_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_capsule *tc)
1290 {
1291 if (tc->nc.nc_qe_len == sizeof(struct nvme_command))
1292 return (tcp_command_pdu(qp, tc));
1293 else
1294 return (tcp_response_pdu(qp, tc));
1295 }
1296
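/*
 * Transmit kthread: convert queued capsules into PDUs and write
 * queued PDUs to the socket as space becomes available, splitting
 * mbuf chains as needed to fit the socket buffer.
 */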
1297 static void
nvmf_tcp_send(void *arg)
1299 {
1300 struct nvmf_tcp_qpair *qp = arg;
1301 struct nvmf_tcp_capsule *tc;
1302 struct socket *so = qp->so;
1303 struct mbuf *m, *n, *p;
1304 u_long space, tosend;
1305 int error;
1306
1307 m = NULL;
1308 SOCKBUF_LOCK(&so->so_snd);
1309 while (!qp->tx_shutdown) {
1310 if (so->so_error != 0) {
1311 error = so->so_error;
1312 SOCKBUF_UNLOCK(&so->so_snd);
1313 error:
1314 m_freem(m);
1315 nvmf_qpair_error(&qp->qp, error);
1316 SOCKBUF_LOCK(&so->so_snd);
1317 while (!qp->tx_shutdown)
1318 cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd));
1319 break;
1320 }
1321
1322 if (m == NULL) {
1323 /* Next PDU to send. */
1324 m = mbufq_dequeue(&qp->tx_pdus);
1325 }
1326 if (m == NULL) {
1327 if (STAILQ_EMPTY(&qp->tx_capsules)) {
1328 cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd));
1329 continue;
1330 }
1331
1332 /* Convert a capsule into a PDU. */
1333 tc = STAILQ_FIRST(&qp->tx_capsules);
1334 STAILQ_REMOVE_HEAD(&qp->tx_capsules, link);
1335 SOCKBUF_UNLOCK(&so->so_snd);
1336
1337 n = capsule_to_pdu(qp, tc);
1338 tcp_release_capsule(tc);
1339
1340 SOCKBUF_LOCK(&so->so_snd);
1341 mbufq_enqueue(&qp->tx_pdus, n);
1342 continue;
1343 }
1344
1345 /*
1346 * Wait until there is enough room to send some data.
1347 * If the socket buffer is empty, always send at least
1348 * something.
1349 */
1350 space = sbspace(&so->so_snd);
1351 if (space < m->m_len && sbused(&so->so_snd) != 0) {
1352 cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd));
1353 continue;
1354 }
1355 SOCKBUF_UNLOCK(&so->so_snd);
1356
1357 /*
1358 * If 'm' is too big, then the socket buffer must be
1359 * empty. Split 'm' to make at least some forward
1360 * progress.
1361 *
1362 * Otherwise, chain up as many pending mbufs from 'm'
1363 * that will fit.
1364 */
1365 if (m->m_len > space) {
1366 n = m_split(m, space, M_WAITOK);
1367 } else {
1368 tosend = m->m_len;
1369 n = m->m_next;
1370 p = m;
1371 while (n != NULL && tosend + n->m_len <= space) {
1372 tosend += n->m_len;
1373 p = n;
1374 n = n->m_next;
1375 }
1376 KASSERT(p->m_next == n, ("%s: p not before n",
1377 __func__));
1378 p->m_next = NULL;
1379
1380 KASSERT(m_length(m, NULL) == tosend,
1381 ("%s: length mismatch", __func__));
1382 }
1383 error = sosend(so, NULL, NULL, m, NULL, MSG_DONTWAIT, NULL);
1384 if (error != 0) {
1385 m = NULL;
1386 m_freem(n);
1387 goto error;
1388 }
1389 m = n;
1390 SOCKBUF_LOCK(&so->so_snd);
1391 }
1392 SOCKBUF_UNLOCK(&so->so_snd);
1393 kthread_exit();
1394 }
1395
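/*
 * Socket upcalls: wake the RX or TX kthread when the socket becomes
 * readable or writable.
 */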
1396 static int
nvmf_soupcall_receive(struct socket *so, void *arg, int waitflag)
1398 {
1399 struct nvmf_tcp_qpair *qp = arg;
1400
1401 if (soreadable(so))
1402 cv_signal(&qp->rx_cv);
1403 return (SU_OK);
1404 }
1405
1406 static int
nvmf_soupcall_send(struct socket *so, void *arg, int waitflag)
1408 {
1409 struct nvmf_tcp_qpair *qp = arg;
1410
1411 if (sowriteable(so))
1412 cv_signal(&qp->tx_cv);
1413 return (SU_OK);
1414 }
1415
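/*
 * Create a queue pair from a connected TCP socket passed in as a file
 * descriptor, taking ownership of the socket and starting the RX and
 * TX kthreads.
 */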
1416 static struct nvmf_qpair *
tcp_allocate_qpair(bool controller, const nvlist_t *nvl)
1418 {
1419 struct nvmf_tcp_qpair *qp;
1420 struct socket *so;
1421 struct file *fp;
1422 cap_rights_t rights;
1423 int error;
1424
1425 if (!nvlist_exists_number(nvl, "fd") ||
1426 !nvlist_exists_number(nvl, "rxpda") ||
1427 !nvlist_exists_number(nvl, "txpda") ||
1428 !nvlist_exists_bool(nvl, "header_digests") ||
1429 !nvlist_exists_bool(nvl, "data_digests") ||
1430 !nvlist_exists_number(nvl, "maxr2t") ||
1431 !nvlist_exists_number(nvl, "maxh2cdata") ||
1432 !nvlist_exists_number(nvl, "max_icd"))
1433 return (NULL);
1434
1435 error = fget(curthread, nvlist_get_number(nvl, "fd"),
1436 cap_rights_init_one(&rights, CAP_SOCK_CLIENT), &fp);
1437 if (error != 0)
1438 return (NULL);
1439 if (fp->f_type != DTYPE_SOCKET) {
1440 fdrop(fp, curthread);
1441 return (NULL);
1442 }
1443 so = fp->f_data;
1444 if (so->so_type != SOCK_STREAM ||
1445 so->so_proto->pr_protocol != IPPROTO_TCP) {
1446 fdrop(fp, curthread);
1447 return (NULL);
1448 }
1449
1450 /* Claim socket from file descriptor. */
1451 fp->f_ops = &badfileops;
1452 fp->f_data = NULL;
1453 fdrop(fp, curthread);
1454
1455 qp = malloc(sizeof(*qp), M_NVMF_TCP, M_WAITOK | M_ZERO);
1456 qp->so = so;
1457 refcount_init(&qp->refs, 1);
1458 qp->txpda = nvlist_get_number(nvl, "txpda");
1459 qp->rxpda = nvlist_get_number(nvl, "rxpda");
1460 qp->header_digests = nvlist_get_bool(nvl, "header_digests");
1461 qp->data_digests = nvlist_get_bool(nvl, "data_digests");
1462 qp->maxr2t = nvlist_get_number(nvl, "maxr2t");
1463 if (controller)
1464 qp->maxh2cdata = nvlist_get_number(nvl, "maxh2cdata");
1465 qp->max_tx_data = tcp_max_transmit_data;
1466 if (!controller) {
1467 qp->max_tx_data = min(qp->max_tx_data,
1468 nvlist_get_number(nvl, "maxh2cdata"));
1469 qp->max_icd = nvlist_get_number(nvl, "max_icd");
1470 }
1471
1472 if (controller) {
1473 /* Use the SUCCESS flag if SQ flow control is disabled. */
1474 qp->send_success = !nvlist_get_bool(nvl, "sq_flow_control");
1475
1476 /* NB: maxr2t is 0's based. */
1477 qp->num_ttags = MIN((u_int)UINT16_MAX + 1,
1478 nvlist_get_number(nvl, "qsize") *
1479 ((uint64_t)qp->maxr2t + 1));
1480 qp->open_ttags = mallocarray(qp->num_ttags,
1481 sizeof(*qp->open_ttags), M_NVMF_TCP, M_WAITOK | M_ZERO);
1482 }
1483
1484 TAILQ_INIT(&qp->rx_buffers.head);
1485 TAILQ_INIT(&qp->tx_buffers.head);
1486 mtx_init(&qp->rx_buffers.lock, "nvmf/tcp rx buffers", NULL, MTX_DEF);
1487 mtx_init(&qp->tx_buffers.lock, "nvmf/tcp tx buffers", NULL, MTX_DEF);
1488
1489 cv_init(&qp->rx_cv, "-");
1490 cv_init(&qp->tx_cv, "-");
1491 mbufq_init(&qp->tx_pdus, 0);
1492 STAILQ_INIT(&qp->tx_capsules);
1493
1494 /* Register socket upcalls. */
1495 SOCKBUF_LOCK(&so->so_rcv);
1496 soupcall_set(so, SO_RCV, nvmf_soupcall_receive, qp);
1497 SOCKBUF_UNLOCK(&so->so_rcv);
1498 SOCKBUF_LOCK(&so->so_snd);
1499 soupcall_set(so, SO_SND, nvmf_soupcall_send, qp);
1500 SOCKBUF_UNLOCK(&so->so_snd);
1501
1502 /* Spin up kthreads. */
1503 error = kthread_add(nvmf_tcp_receive, qp, NULL, &qp->rx_thread, 0, 0,
1504 "nvmef tcp rx");
1505 if (error != 0) {
1506 tcp_free_qpair(&qp->qp);
1507 return (NULL);
1508 }
1509 error = kthread_add(nvmf_tcp_send, qp, NULL, &qp->tx_thread, 0, 0,
1510 "nvmef tcp tx");
1511 if (error != 0) {
1512 tcp_free_qpair(&qp->qp);
1513 return (NULL);
1514 }
1515
1516 return (&qp->qp);
1517 }
1518
1519 static void
tcp_release_qpair(struct nvmf_tcp_qpair *qp)
1521 {
1522 if (refcount_release(&qp->refs))
1523 free(qp, M_NVMF_TCP);
1524 }
1525
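/*
 * Tear down a queue pair: stop both kthreads, clear the socket
 * upcalls, and abort any outstanding capsules and command buffers
 * before closing the socket.
 */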
1526 static void
tcp_free_qpair(struct nvmf_qpair *nq)
1528 {
1529 struct nvmf_tcp_qpair *qp = TQP(nq);
1530 struct nvmf_tcp_command_buffer *ncb, *cb;
1531 struct nvmf_tcp_capsule *ntc, *tc;
1532 struct socket *so = qp->so;
1533
1534 /* Shut down kthreads and clear upcalls */
1535 SOCKBUF_LOCK(&so->so_snd);
1536 qp->tx_shutdown = true;
1537 if (qp->tx_thread != NULL) {
1538 cv_signal(&qp->tx_cv);
1539 mtx_sleep(qp->tx_thread, SOCKBUF_MTX(&so->so_snd), 0,
1540 "nvtcptx", 0);
1541 }
1542 soupcall_clear(so, SO_SND);
1543 SOCKBUF_UNLOCK(&so->so_snd);
1544
1545 SOCKBUF_LOCK(&so->so_rcv);
1546 qp->rx_shutdown = true;
1547 if (qp->rx_thread != NULL) {
1548 cv_signal(&qp->rx_cv);
1549 mtx_sleep(qp->rx_thread, SOCKBUF_MTX(&so->so_rcv), 0,
1550 "nvtcprx", 0);
1551 }
1552 soupcall_clear(so, SO_RCV);
1553 SOCKBUF_UNLOCK(&so->so_rcv);
1554
1555 STAILQ_FOREACH_SAFE(tc, &qp->tx_capsules, link, ntc) {
1556 nvmf_abort_capsule_data(&tc->nc, ECONNABORTED);
1557 tcp_release_capsule(tc);
1558 }
1559 mbufq_drain(&qp->tx_pdus);
1560
1561 cv_destroy(&qp->tx_cv);
1562 cv_destroy(&qp->rx_cv);
1563
1564 if (qp->open_ttags != NULL) {
1565 for (u_int i = 0; i < qp->num_ttags; i++) {
1566 cb = qp->open_ttags[i];
1567 if (cb != NULL) {
1568 cb->tc->active_r2ts--;
1569 cb->error = ECONNABORTED;
1570 tcp_release_command_buffer(cb);
1571 }
1572 }
1573 free(qp->open_ttags, M_NVMF_TCP);
1574 }
1575
1576 mtx_lock(&qp->rx_buffers.lock);
1577 TAILQ_FOREACH_SAFE(cb, &qp->rx_buffers.head, link, ncb) {
1578 tcp_remove_command_buffer(&qp->rx_buffers, cb);
1579 mtx_unlock(&qp->rx_buffers.lock);
1580 #ifdef INVARIANTS
1581 if (cb->tc != NULL)
1582 cb->tc->pending_r2ts--;
1583 #endif
1584 cb->error = ECONNABORTED;
1585 tcp_release_command_buffer(cb);
1586 mtx_lock(&qp->rx_buffers.lock);
1587 }
1588 mtx_destroy(&qp->rx_buffers.lock);
1589
1590 mtx_lock(&qp->tx_buffers.lock);
1591 TAILQ_FOREACH_SAFE(cb, &qp->tx_buffers.head, link, ncb) {
1592 tcp_remove_command_buffer(&qp->tx_buffers, cb);
1593 mtx_unlock(&qp->tx_buffers.lock);
1594 cb->error = ECONNABORTED;
1595 tcp_release_command_buffer(cb);
1596 mtx_lock(&qp->tx_buffers.lock);
1597 }
1598 mtx_destroy(&qp->tx_buffers.lock);
1599
1600 soclose(so);
1601
1602 tcp_release_qpair(qp);
1603 }
1604
1605 static struct nvmf_capsule *
tcp_allocate_capsule(struct nvmf_qpair *nq, int how)
1607 {
1608 struct nvmf_tcp_qpair *qp = TQP(nq);
1609 struct nvmf_tcp_capsule *tc;
1610
1611 tc = malloc(sizeof(*tc), M_NVMF_TCP, how | M_ZERO);
1612 if (tc == NULL)
1613 return (NULL);
1614 refcount_init(&tc->refs, 1);
1615 refcount_acquire(&qp->refs);
1616 return (&tc->nc);
1617 }
1618
1619 static void
tcp_release_capsule(struct nvmf_tcp_capsule *tc)
1621 {
1622 struct nvmf_tcp_qpair *qp = TQP(tc->nc.nc_qpair);
1623
1624 if (!refcount_release(&tc->refs))
1625 return;
1626
1627 MPASS(tc->active_r2ts == 0);
1628 MPASS(tc->pending_r2ts == 0);
1629
1630 nvmf_tcp_free_pdu(&tc->rx_pdu);
1631 free(tc, M_NVMF_TCP);
1632 tcp_release_qpair(qp);
1633 }
1634
1635 static void
tcp_free_capsule(struct nvmf_capsule *nc)
1637 {
1638 struct nvmf_tcp_capsule *tc = TCAP(nc);
1639
1640 tcp_release_capsule(tc);
1641 }
1642
1643 static int
tcp_transmit_capsule(struct nvmf_capsule *nc)
1645 {
1646 struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
1647 struct nvmf_tcp_capsule *tc = TCAP(nc);
1648 struct socket *so = qp->so;
1649
1650 refcount_acquire(&tc->refs);
1651 SOCKBUF_LOCK(&so->so_snd);
1652 STAILQ_INSERT_TAIL(&qp->tx_capsules, tc, link);
1653 if (sowriteable(so))
1654 cv_signal(&qp->tx_cv);
1655 SOCKBUF_UNLOCK(&so->so_snd);
1656 return (0);
1657 }
1658
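/*
 * Validate the SGL of a received command capsule: an ICD descriptor
 * must match the amount of in-capsule data received, while a command
 * buffer descriptor must not be accompanied by any ICD.
 */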
1659 static uint8_t
tcp_validate_command_capsule(struct nvmf_capsule *nc)
1661 {
1662 struct nvmf_tcp_capsule *tc = TCAP(nc);
1663 struct nvme_sgl_descriptor *sgl;
1664
1665 KASSERT(tc->rx_pdu.hdr != NULL, ("capsule wasn't received"));
1666
1667 sgl = &nc->nc_sqe.sgl;
1668 switch (sgl->type) {
1669 case NVME_SGL_TYPE_ICD:
1670 if (tc->rx_pdu.data_len != le32toh(sgl->length)) {
1671 printf("NVMe/TCP: Command Capsule with mismatched ICD length\n");
1672 return (NVME_SC_DATA_SGL_LENGTH_INVALID);
1673 }
1674 break;
1675 case NVME_SGL_TYPE_COMMAND_BUFFER:
1676 if (tc->rx_pdu.data_len != 0) {
1677 printf("NVMe/TCP: Command Buffer SGL with ICD\n");
1678 return (NVME_SC_INVALID_FIELD);
1679 }
1680 break;
1681 default:
1682 printf("NVMe/TCP: Invalid SGL type in Command Capsule\n");
1683 return (NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID);
1684 }
1685
1686 if (sgl->address != 0) {
1687 printf("NVMe/TCP: Invalid SGL offset in Command Capsule\n");
1688 return (NVME_SC_SGL_OFFSET_INVALID);
1689 }
1690
1691 return (NVME_SC_SUCCESS);
1692 }
1693
1694 static size_t
tcp_capsule_data_len(const struct nvmf_capsule *nc)
1696 {
1697 MPASS(nc->nc_qe_len == sizeof(struct nvme_command));
1698 return (le32toh(nc->nc_sqe.sgl.length));
1699 }
1700
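/*
 * Controller only: fetch host data for a command using a command
 * buffer SGL.  An R2T is sent immediately if a transfer tag and R2T
 * slot are available; otherwise the request is queued until one frees
 * up.
 */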
1701 static void
tcp_receive_r2t_data(struct nvmf_capsule *nc, uint32_t data_offset,
1703 struct nvmf_io_request *io)
1704 {
1705 struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
1706 struct nvmf_tcp_capsule *tc = TCAP(nc);
1707 struct nvmf_tcp_command_buffer *cb;
1708
1709 cb = tcp_alloc_command_buffer(qp, io, data_offset, io->io_len,
1710 nc->nc_sqe.cid);
1711
1712 cb->tc = tc;
1713 refcount_acquire(&tc->refs);
1714
1715 /*
1716 * If this command has too many active R2Ts or there are no
1717 * available transfer tags, queue the request for later.
1718 *
1719 * NB: maxr2t is 0's based.
1720 */
1721 mtx_lock(&qp->rx_buffers.lock);
1722 if (tc->active_r2ts > qp->maxr2t || qp->active_ttags == qp->num_ttags) {
1723 #ifdef INVARIANTS
1724 tc->pending_r2ts++;
1725 #endif
1726 TAILQ_INSERT_TAIL(&qp->rx_buffers.head, cb, link);
1727 mtx_unlock(&qp->rx_buffers.lock);
1728 return;
1729 }
1730
1731 nvmf_tcp_allocate_ttag(qp, cb);
1732 mtx_unlock(&qp->rx_buffers.lock);
1733
1734 tcp_send_r2t(qp, nc->nc_sqe.cid, cb->ttag, data_offset, io->io_len);
1735 }
1736
1737 static void
tcp_receive_icd_data(struct nvmf_capsule *nc, uint32_t data_offset,
1739 struct nvmf_io_request *io)
1740 {
1741 struct nvmf_tcp_capsule *tc = TCAP(nc);
1742
1743 mbuf_copyto_io(tc->rx_pdu.m, tc->rx_pdu.hdr->pdo + data_offset,
1744 io->io_len, io, 0);
1745 nvmf_complete_io_request(io, io->io_len, 0);
1746 }
1747
1748 static int
tcp_receive_controller_data(struct nvmf_capsule *nc, uint32_t data_offset,
1750 struct nvmf_io_request *io)
1751 {
1752 struct nvme_sgl_descriptor *sgl;
1753 size_t data_len;
1754
1755 if (nc->nc_qe_len != sizeof(struct nvme_command) ||
1756 !nc->nc_qpair->nq_controller)
1757 return (EINVAL);
1758
1759 sgl = &nc->nc_sqe.sgl;
1760 data_len = le32toh(sgl->length);
1761 if (data_offset + io->io_len > data_len)
1762 return (EFBIG);
1763
1764 if (sgl->type == NVME_SGL_TYPE_ICD)
1765 tcp_receive_icd_data(nc, data_offset, io);
1766 else
1767 tcp_receive_r2t_data(nc, data_offset, io);
1768 return (0);
1769 }
1770
1771 /* NB: cid is little-endian already. */
1772 static void
tcp_send_c2h_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid, uint32_t data_offset,
1774 struct mbuf *m, size_t len, bool last_pdu, bool success)
1775 {
1776 struct nvme_tcp_c2h_data_hdr c2h;
1777 struct mbuf *top;
1778
1779 memset(&c2h, 0, sizeof(c2h));
1780 c2h.common.pdu_type = NVME_TCP_PDU_TYPE_C2H_DATA;
1781 if (last_pdu)
1782 c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_LAST_PDU;
1783 if (success)
1784 c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_SUCCESS;
1785 c2h.cccid = cid;
1786 c2h.datao = htole32(data_offset);
1787 c2h.datal = htole32(len);
1788
1789 top = nvmf_tcp_construct_pdu(qp, &c2h, sizeof(c2h), m, len);
1790 nvmf_tcp_write_pdu(qp, top);
1791 }
1792
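/*
 * Controller only: transmit read data to the host as one or more
 * C2H_DATA PDUs.  When SQ flow control is disabled, the SUCCESS flag
 * is set on the final PDU of the transfer so that no response capsule
 * is needed.
 */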
1793 static u_int
tcp_send_controller_data(struct nvmf_capsule *nc, uint32_t data_offset,
1795 struct mbuf *m, size_t len)
1796 {
1797 struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
1798 struct nvme_sgl_descriptor *sgl;
1799 uint32_t data_len;
1800 bool last_pdu, last_xfer;
1801
1802 if (nc->nc_qe_len != sizeof(struct nvme_command) ||
1803 !qp->qp.nq_controller) {
1804 m_freem(m);
1805 return (NVME_SC_INVALID_FIELD);
1806 }
1807
1808 sgl = &nc->nc_sqe.sgl;
1809 data_len = le32toh(sgl->length);
1810 if (data_offset + len > data_len) {
1811 m_freem(m);
1812 return (NVME_SC_INVALID_FIELD);
1813 }
1814 last_xfer = (data_offset + len == data_len);
1815
1816 if (sgl->type != NVME_SGL_TYPE_COMMAND_BUFFER) {
1817 m_freem(m);
1818 return (NVME_SC_INVALID_FIELD);
1819 }
1820
1821 KASSERT(data_offset == TCAP(nc)->tx_data_offset,
1822 ("%s: starting data_offset %u doesn't match end of previous xfer %u",
1823 __func__, data_offset, TCAP(nc)->tx_data_offset));
1824
/* Queue one or more C2H_DATA PDUs containing the data from 'm'. */
1826 while (m != NULL) {
1827 struct mbuf *n;
1828 uint32_t todo;
1829
1830 if (m->m_len > qp->max_tx_data) {
1831 n = m_split(m, qp->max_tx_data, M_WAITOK);
1832 todo = m->m_len;
1833 } else {
1834 struct mbuf *p;
1835
1836 todo = m->m_len;
1837 p = m;
1838 n = p->m_next;
1839 while (n != NULL) {
1840 if (todo + n->m_len > qp->max_tx_data) {
1841 p->m_next = NULL;
1842 break;
1843 }
1844 todo += n->m_len;
1845 p = n;
1846 n = p->m_next;
1847 }
1848 MPASS(m_length(m, NULL) == todo);
1849 }
1850
1851 last_pdu = (n == NULL && last_xfer);
1852 tcp_send_c2h_pdu(qp, nc->nc_sqe.cid, data_offset, m, todo,
1853 last_pdu, last_pdu && qp->send_success);
1854
1855 data_offset += todo;
1856 data_len -= todo;
1857 m = n;
1858 }
1859 MPASS(data_len == 0);
1860
1861 #ifdef INVARIANTS
1862 TCAP(nc)->tx_data_offset = data_offset;
1863 #endif
1864 if (!last_xfer)
1865 return (NVMF_MORE);
1866 else if (qp->send_success)
1867 return (NVMF_SUCCESS_SENT);
1868 else
1869 return (NVME_SC_SUCCESS);
1870 }
1871
1872 struct nvmf_transport_ops tcp_ops = {
1873 .allocate_qpair = tcp_allocate_qpair,
1874 .free_qpair = tcp_free_qpair,
1875 .allocate_capsule = tcp_allocate_capsule,
1876 .free_capsule = tcp_free_capsule,
1877 .transmit_capsule = tcp_transmit_capsule,
1878 .validate_command_capsule = tcp_validate_command_capsule,
1879 .capsule_data_len = tcp_capsule_data_len,
1880 .receive_controller_data = tcp_receive_controller_data,
1881 .send_controller_data = tcp_send_controller_data,
1882 .trtype = NVMF_TRTYPE_TCP,
1883 .priority = 0,
1884 };
1885
1886 NVMF_TRANSPORT(tcp, tcp_ops);
1887