1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2022-2024 Chelsio Communications, Inc.
5 * Written by: John Baldwin <jhb@FreeBSD.org>
6 */
7
8 #include <sys/endian.h>
9 #include <sys/gsb_crc32.h>
10 #include <sys/queue.h>
11 #include <sys/uio.h>
12 #include <assert.h>
13 #include <errno.h>
14 #include <stdio.h>
15 #include <stdlib.h>
16 #include <string.h>
17 #include <unistd.h>
18
19 #include "libnvmf.h"
20 #include "internal.h"
21 #include "nvmf_tcp.h"
22
23 struct nvmf_tcp_qpair;
24
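/*
 * Tracks a local data buffer associated with a command while data is
 * transferred via H2C_DATA, C2H_DATA, or R2T PDUs.  'data_xfered'
 * records how much of the buffer has been transferred so far; 'cid'
 * and 'ttag' identify the command and transfer tag the buffer
 * belongs to.
 */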
25 struct nvmf_tcp_command_buffer {
26 struct nvmf_tcp_qpair *qp;
27
28 void *data;
29 size_t data_len;
30 size_t data_xfered;
31 uint32_t data_offset;
32
33 uint16_t cid;
34 uint16_t ttag;
35
36 LIST_ENTRY(nvmf_tcp_command_buffer) link;
37 };
38
39 LIST_HEAD(nvmf_tcp_command_buffer_list, nvmf_tcp_command_buffer);
40
41 struct nvmf_tcp_association {
42 struct nvmf_association na;
43
44 uint32_t ioccsz;
45 };
46
47 struct nvmf_tcp_rxpdu {
48 struct nvme_tcp_common_pdu_hdr *hdr;
49 uint32_t data_len;
50 };
51
52 struct nvmf_tcp_capsule {
53 struct nvmf_capsule nc;
54
55 struct nvmf_tcp_rxpdu rx_pdu;
56 struct nvmf_tcp_command_buffer *cb;
57
58 TAILQ_ENTRY(nvmf_tcp_capsule) link;
59 };
60
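/*
 * Per-queue-pair transport state: the connected socket plus the
 * parameters negotiated during the ICReq/ICResp exchange (PDU data
 * alignment, digests, MAXR2T, MAXH2CDATA, in-capsule data limit) and
 * lists of pending command buffers and received capsules.
 */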
61 struct nvmf_tcp_qpair {
62 struct nvmf_qpair qp;
63 int s;
64
65 uint8_t txpda;
66 uint8_t rxpda;
67 bool header_digests;
68 bool data_digests;
69 uint32_t maxr2t;
70 uint32_t maxh2cdata;
71 uint32_t max_icd; /* Host only */
72 uint16_t next_ttag; /* Controller only */
73
74 struct nvmf_tcp_command_buffer_list tx_buffers;
75 struct nvmf_tcp_command_buffer_list rx_buffers;
76 TAILQ_HEAD(, nvmf_tcp_capsule) rx_capsules;
77 };
78
79 #define TASSOC(na) ((struct nvmf_tcp_association *)(na))
80 #define TCAP(nc) ((struct nvmf_tcp_capsule *)(nc))
81 #define CTCAP(nc) ((const struct nvmf_tcp_capsule *)(nc))
82 #define TQP(qp) ((struct nvmf_tcp_qpair *)(qp))
83
84 static const char zero_padding[NVME_TCP_PDU_PDO_MAX_OFFSET];
85
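/*
 * Header and data digests are CRC-32C values computed over the PDU
 * header or payload (seeded with ~0 and bit-inverted at the end, per
 * the usual CRC-32C convention).
 */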
86 static uint32_t
87 compute_digest(const void *buf, size_t len)
88 {
89 return (calculate_crc32c(0xffffffff, buf, len) ^ 0xffffffff);
90 }
91
92 static struct nvmf_tcp_command_buffer *
93 tcp_alloc_command_buffer(struct nvmf_tcp_qpair *qp, void *data,
94 uint32_t data_offset, size_t data_len, uint16_t cid, uint16_t ttag,
95 bool receive)
96 {
97 struct nvmf_tcp_command_buffer *cb;
98
99 cb = malloc(sizeof(*cb));
100 cb->qp = qp;
101 cb->data = data;
102 cb->data_offset = data_offset;
103 cb->data_len = data_len;
104 cb->data_xfered = 0;
105 cb->cid = cid;
106 cb->ttag = ttag;
107
108 if (receive)
109 LIST_INSERT_HEAD(&qp->rx_buffers, cb, link);
110 else
111 LIST_INSERT_HEAD(&qp->tx_buffers, cb, link);
112 return (cb);
113 }
114
115 static struct nvmf_tcp_command_buffer *
116 tcp_find_command_buffer(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
117 bool receive)
118 {
119 struct nvmf_tcp_command_buffer_list *list;
120 struct nvmf_tcp_command_buffer *cb;
121
122 list = receive ? &qp->rx_buffers : &qp->tx_buffers;
123 LIST_FOREACH(cb, list, link) {
124 if (cb->cid == cid && cb->ttag == ttag)
125 return (cb);
126 }
127 return (NULL);
128 }
129
130 static void
131 tcp_purge_command_buffer(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
132 bool receive)
133 {
134 struct nvmf_tcp_command_buffer *cb;
135
136 cb = tcp_find_command_buffer(qp, cid, ttag, receive);
137 if (cb != NULL)
138 LIST_REMOVE(cb, link);
139 }
140
141 static void
142 tcp_free_command_buffer(struct nvmf_tcp_command_buffer *cb)
143 {
144 LIST_REMOVE(cb, link);
145 free(cb);
146 }
147
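/* Write a fully-constructed PDU to the socket, retrying short writes. */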
148 static int
149 nvmf_tcp_write_pdu(struct nvmf_tcp_qpair *qp, const void *pdu, size_t len)
150 {
151 ssize_t nwritten;
152 const char *cp;
153
154 cp = pdu;
155 while (len != 0) {
156 nwritten = write(qp->s, cp, len);
157 if (nwritten < 0)
158 return (errno);
159 len -= nwritten;
160 cp += nwritten;
161 }
162 return (0);
163 }
164
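/*
 * Gather-write variant of nvmf_tcp_write_pdu().  On a short write the
 * iovec array is advanced in place and the write is retried until
 * 'len' bytes have been sent.
 */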
165 static int
166 nvmf_tcp_write_pdu_iov(struct nvmf_tcp_qpair *qp, struct iovec *iov,
167 u_int iovcnt, size_t len)
168 {
169 ssize_t nwritten;
170
171 for (;;) {
172 nwritten = writev(qp->s, iov, iovcnt);
173 if (nwritten < 0)
174 return (errno);
175
176 len -= nwritten;
177 if (len == 0)
178 return (0);
179
180 while (iov->iov_len <= (size_t)nwritten) {
181 nwritten -= iov->iov_len;
182 iovcnt--;
183 iov++;
184 }
185
186 iov->iov_base = (char *)iov->iov_base + nwritten;
187 iov->iov_len -= nwritten;
188 }
189 }
190
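/*
 * Send a C2H or H2C termination request describing a fatal error,
 * optionally echoing up to 'hlen' bytes of the offending PDU header,
 * and then close the connection.
 */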
191 static void
192 nvmf_tcp_report_error(struct nvmf_association *na, struct nvmf_tcp_qpair *qp,
193 uint16_t fes, uint32_t fei, const void *rx_pdu, size_t pdu_len, u_int hlen)
194 {
195 struct nvme_tcp_term_req_hdr hdr;
196 struct iovec iov[2];
197
198 if (hlen != 0) {
199 if (hlen > NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE)
200 hlen = NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE;
201 if (hlen > pdu_len)
202 hlen = pdu_len;
203 }
204
205 memset(&hdr, 0, sizeof(hdr));
206 hdr.common.pdu_type = na->na_controller ?
207 NVME_TCP_PDU_TYPE_C2H_TERM_REQ : NVME_TCP_PDU_TYPE_H2C_TERM_REQ;
208 hdr.common.hlen = sizeof(hdr);
209 hdr.common.plen = htole32(sizeof(hdr) + hlen);
210 hdr.fes = htole16(fes);
211 le32enc(hdr.fei, fei);
212 iov[0].iov_base = &hdr;
213 iov[0].iov_len = sizeof(hdr);
214 iov[1].iov_base = __DECONST(void *, rx_pdu);
215 iov[1].iov_len = hlen;
216
217 (void)nvmf_tcp_write_pdu_iov(qp, iov, nitems(iov), sizeof(hdr) + hlen);
218 close(qp->s);
219 qp->s = -1;
220 }
221
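/*
 * Validate a received PDU header and verify its header and data
 * digests if enabled.  On failure a termination request is sent (or
 * the connection is dropped) and an error is returned.
 */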
222 static int
223 nvmf_tcp_validate_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu,
224 size_t pdu_len)
225 {
226 const struct nvme_tcp_common_pdu_hdr *ch;
227 uint32_t data_len, fei, plen;
228 uint32_t digest, rx_digest;
229 u_int hlen;
230 int error;
231 uint16_t fes;
232
233 /* Determine how large of a PDU header to return for errors. */
234 ch = pdu->hdr;
235 hlen = ch->hlen;
236 plen = le32toh(ch->plen);
237 if (hlen < sizeof(*ch) || hlen > plen)
238 hlen = sizeof(*ch);
239
240 error = nvmf_tcp_validate_pdu_header(ch,
241 qp->qp.nq_association->na_controller, qp->header_digests,
242 qp->data_digests, qp->rxpda, &data_len, &fes, &fei);
243 if (error != 0) {
244 if (error == ECONNRESET) {
245 close(qp->s);
246 qp->s = -1;
247 } else {
248 nvmf_tcp_report_error(qp->qp.nq_association, qp,
249 fes, fei, ch, pdu_len, hlen);
250 }
251 return (error);
252 }
253
254 /* Check header digest if present. */
255 if ((ch->flags & NVME_TCP_CH_FLAGS_HDGSTF) != 0) {
256 digest = compute_digest(ch, ch->hlen);
257 memcpy(&rx_digest, (const char *)ch + ch->hlen,
258 sizeof(rx_digest));
259 if (digest != rx_digest) {
260 printf("NVMe/TCP: Header digest mismatch\n");
261 nvmf_tcp_report_error(qp->qp.nq_association, qp,
262 NVME_TCP_TERM_REQ_FES_HDGST_ERROR, rx_digest, ch,
263 pdu_len, hlen);
264 return (EBADMSG);
265 }
266 }
267
268 /* Check data digest if present. */
269 if ((ch->flags & NVME_TCP_CH_FLAGS_DDGSTF) != 0) {
270 digest = compute_digest((const char *)ch + ch->pdo, data_len);
271 memcpy(&rx_digest, (const char *)ch + plen - sizeof(rx_digest),
272 sizeof(rx_digest));
273 if (digest != rx_digest) {
274 printf("NVMe/TCP: Data digest mismatch\n");
275 return (EBADMSG);
276 }
277 }
278
279 pdu->data_len = data_len;
280 return (0);
281 }
282
283 /*
284 * Read data from a socket, retrying until the data has been fully
285 * read or an error occurs.
286 */
287 static int
288 nvmf_tcp_read_buffer(int s, void *buf, size_t len)
289 {
290 ssize_t nread;
291 char *cp;
292
293 cp = buf;
294 while (len != 0) {
295 nread = read(s, cp, len);
296 if (nread < 0)
297 return (errno);
298 if (nread == 0)
299 return (ECONNRESET);
300 len -= nread;
301 cp += nread;
302 }
303 return (0);
304 }
305
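/*
 * Read a single PDU from the socket: the common header first, then
 * the remainder of the PDU into a malloc'd buffer, which is validated
 * before being returned in '*pdu'.
 */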
306 static int
307 nvmf_tcp_read_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
308 {
309 struct nvme_tcp_common_pdu_hdr ch;
310 uint32_t plen;
311 int error;
312
313 memset(pdu, 0, sizeof(*pdu));
314 error = nvmf_tcp_read_buffer(qp->s, &ch, sizeof(ch));
315 if (error != 0)
316 return (error);
317
318 plen = le32toh(ch.plen);
319
320 /*
321 * If the lengths are nonsensical, validate just the common
322 * header so an error is reported without reading any further.
323 */
324 if (plen < sizeof(ch) || ch.hlen > plen) {
325 pdu->hdr = &ch;
326 error = nvmf_tcp_validate_pdu(qp, pdu, sizeof(ch));
327 pdu->hdr = NULL;
328 assert(error != 0);
329 return (error);
330 }
331
332 /* Read the rest of the PDU. */
333 pdu->hdr = malloc(plen);
334 memcpy(pdu->hdr, &ch, sizeof(ch));
335 error = nvmf_tcp_read_buffer(qp->s, pdu->hdr + 1, plen - sizeof(ch));
336 if (error != 0) {
	free(pdu->hdr);
	pdu->hdr = NULL;
337 	return (error);
}
338 error = nvmf_tcp_validate_pdu(qp, pdu, plen);
339 if (error != 0) {
340 free(pdu->hdr);
341 pdu->hdr = NULL;
342 }
343 return (error);
344 }
345
346 static void
347 nvmf_tcp_free_pdu(struct nvmf_tcp_rxpdu *pdu)
348 {
349 free(pdu->hdr);
350 pdu->hdr = NULL;
351 }
352
353 static int
354 nvmf_tcp_handle_term_req(struct nvmf_tcp_rxpdu *pdu)
355 {
356 struct nvme_tcp_term_req_hdr *hdr;
357
358 hdr = (void *)pdu->hdr;
359
360 printf("NVMe/TCP: Received termination request: fes %#x fei %#x\n",
361 le16toh(hdr->fes), le32dec(hdr->fei));
362 nvmf_tcp_free_pdu(pdu);
363 return (ECONNRESET);
364 }
365
366 static int
367 nvmf_tcp_save_command_capsule(struct nvmf_tcp_qpair *qp,
368 struct nvmf_tcp_rxpdu *pdu)
369 {
370 struct nvme_tcp_cmd *cmd;
371 struct nvmf_capsule *nc;
372 struct nvmf_tcp_capsule *tc;
373
374 cmd = (void *)pdu->hdr;
375
376 nc = nvmf_allocate_command(&qp->qp, &cmd->ccsqe);
377 if (nc == NULL)
378 return (ENOMEM);
379
380 tc = TCAP(nc);
381 tc->rx_pdu = *pdu;
382
383 TAILQ_INSERT_TAIL(&qp->rx_capsules, tc, link);
384 return (0);
385 }
386
387 static int
388 nvmf_tcp_save_response_capsule(struct nvmf_tcp_qpair *qp,
389 struct nvmf_tcp_rxpdu *pdu)
390 {
391 struct nvme_tcp_rsp *rsp;
392 struct nvmf_capsule *nc;
393 struct nvmf_tcp_capsule *tc;
394
395 rsp = (void *)pdu->hdr;
396
397 nc = nvmf_allocate_response(&qp->qp, &rsp->rccqe);
398 if (nc == NULL)
399 return (ENOMEM);
400
401 nc->nc_sqhd_valid = true;
402 tc = TCAP(nc);
403 tc->rx_pdu = *pdu;
404
405 TAILQ_INSERT_TAIL(&qp->rx_capsules, tc, link);
406
407 /*
408 * Once the CQE has been received, no further transfers to the
409 * command buffer for the associated CID can occur.
410 */
411 tcp_purge_command_buffer(qp, rsp->rccqe.cid, 0, true);
412 tcp_purge_command_buffer(qp, rsp->rccqe.cid, 0, false);
413
414 return (0);
415 }
416
417 /*
418 * Construct and send a PDU that contains an optional data payload.
419 * This includes dealing with digests and the length fields in the
420 * common header.
421 */
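/*
 * On-the-wire layout produced below (optional fields in brackets):
 *
 *	CH+PSH | [HDGST] | [PAD to PDO] | [DATA] | [DDGST]
 */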
422 static int
423 nvmf_tcp_construct_pdu(struct nvmf_tcp_qpair *qp, void *hdr, size_t hlen,
424 void *data, uint32_t data_len)
425 {
426 struct nvme_tcp_common_pdu_hdr *ch;
427 struct iovec iov[5];
428 u_int iovcnt;
429 uint32_t header_digest, data_digest, pad, pdo, plen;
430
431 plen = hlen;
432 if (qp->header_digests)
433 plen += sizeof(header_digest);
434 if (data_len != 0) {
435 pdo = roundup(plen, qp->txpda);
436 pad = pdo - plen;
437 plen = pdo + data_len;
438 if (qp->data_digests)
439 plen += sizeof(data_digest);
440 } else {
441 assert(data == NULL);
442 pdo = 0;
443 pad = 0;
444 }
445
446 ch = hdr;
447 ch->hlen = hlen;
448 if (qp->header_digests)
449 ch->flags |= NVME_TCP_CH_FLAGS_HDGSTF;
450 if (qp->data_digests && data_len != 0)
451 ch->flags |= NVME_TCP_CH_FLAGS_DDGSTF;
452 ch->pdo = pdo;
453 ch->plen = htole32(plen);
454
455 /* CH + PSH */
456 iov[0].iov_base = hdr;
457 iov[0].iov_len = hlen;
458 iovcnt = 1;
459
460 /* HDGST */
461 if (qp->header_digests) {
462 header_digest = compute_digest(hdr, hlen);
463 iov[iovcnt].iov_base = &header_digest;
464 iov[iovcnt].iov_len = sizeof(header_digest);
465 iovcnt++;
466 }
467
468 if (pad != 0) {
469 /* PAD */
470 iov[iovcnt].iov_base = __DECONST(char *, zero_padding);
471 iov[iovcnt].iov_len = pad;
472 iovcnt++;
473 }
474
475 if (data_len != 0) {
476 /* DATA */
477 iov[iovcnt].iov_base = data;
478 iov[iovcnt].iov_len = data_len;
479 iovcnt++;
480
481 /* DDGST */
482 if (qp->data_digests) {
483 data_digest = compute_digest(data, data_len);
484 iov[iovcnt].iov_base = &data_digest;
485 iov[iovcnt].iov_len = sizeof(data_digest);
486 iovcnt++;
487 }
488 }
489
490 return (nvmf_tcp_write_pdu_iov(qp, iov, iovcnt, plen));
491 }
492
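/*
 * Controller side: copy the payload of an H2C_DATA PDU into the
 * matching receive command buffer after checking the transfer tag,
 * offset, length, and LAST_PDU flag for consistency.
 */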
493 static int
494 nvmf_tcp_handle_h2c_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
495 {
496 struct nvme_tcp_h2c_data_hdr *h2c;
497 struct nvmf_tcp_command_buffer *cb;
498 uint32_t data_len, data_offset;
499 const char *icd;
500
501 h2c = (void *)pdu->hdr;
502 if (le32toh(h2c->datal) > qp->maxh2cdata) {
503 nvmf_tcp_report_error(qp->qp.nq_association, qp,
504 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_LIMIT_EXCEEDED, 0,
505 pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen);
506 nvmf_tcp_free_pdu(pdu);
507 return (EBADMSG);
508 }
509
510 cb = tcp_find_command_buffer(qp, h2c->cccid, h2c->ttag, true);
511 if (cb == NULL) {
512 nvmf_tcp_report_error(qp->qp.nq_association, qp,
513 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
514 offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->hdr,
515 le32toh(pdu->hdr->plen), pdu->hdr->hlen);
516 nvmf_tcp_free_pdu(pdu);
517 return (EBADMSG);
518 }
519
520 data_len = le32toh(h2c->datal);
521 if (data_len != pdu->data_len) {
522 nvmf_tcp_report_error(qp->qp.nq_association, qp,
523 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
524 offsetof(struct nvme_tcp_h2c_data_hdr, datal), pdu->hdr,
525 le32toh(pdu->hdr->plen), pdu->hdr->hlen);
526 nvmf_tcp_free_pdu(pdu);
527 return (EBADMSG);
528 }
529
530 data_offset = le32toh(h2c->datao);
531 if (data_offset < cb->data_offset ||
532 data_offset + data_len > cb->data_offset + cb->data_len) {
533 nvmf_tcp_report_error(qp->qp.nq_association, qp,
534 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
535 pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen);
536 nvmf_tcp_free_pdu(pdu);
537 return (EBADMSG);
538 }
539
540 if (data_offset != cb->data_offset + cb->data_xfered) {
541 nvmf_tcp_report_error(qp->qp.nq_association, qp,
542 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
543 le32toh(pdu->hdr->plen), pdu->hdr->hlen);
544 nvmf_tcp_free_pdu(pdu);
545 return (EBADMSG);
546 }
547
548 if ((cb->data_xfered + data_len == cb->data_len) !=
549 ((pdu->hdr->flags & NVME_TCP_H2C_DATA_FLAGS_LAST_PDU) != 0)) {
550 nvmf_tcp_report_error(qp->qp.nq_association, qp,
551 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
552 le32toh(pdu->hdr->plen), pdu->hdr->hlen);
553 nvmf_tcp_free_pdu(pdu);
554 return (EBADMSG);
555 }
556
557 cb->data_xfered += data_len;
558 data_offset -= cb->data_offset;
559 icd = (const char *)pdu->hdr + pdu->hdr->pdo;
560 memcpy((char *)cb->data + data_offset, icd, data_len);
561
562 nvmf_tcp_free_pdu(pdu);
563 return (0);
564 }
565
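/*
 * Host side: copy the payload of a C2H_DATA PDU into the matching
 * receive command buffer.  If the SUCCESS flag is set, synthesize a
 * completion so the caller does not wait for a separate response
 * capsule.
 */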
566 static int
567 nvmf_tcp_handle_c2h_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
568 {
569 struct nvme_tcp_c2h_data_hdr *c2h;
570 struct nvmf_tcp_command_buffer *cb;
571 uint32_t data_len, data_offset;
572 const char *icd;
573
574 c2h = (void *)pdu->hdr;
575
576 cb = tcp_find_command_buffer(qp, c2h->cccid, 0, true);
577 if (cb == NULL) {
578 /*
579 * XXX: Could be PDU sequence error if cccid is for a
580 * command that doesn't use a command buffer.
581 */
582 nvmf_tcp_report_error(qp->qp.nq_association, qp,
583 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
584 offsetof(struct nvme_tcp_c2h_data_hdr, cccid), pdu->hdr,
585 le32toh(pdu->hdr->plen), pdu->hdr->hlen);
586 nvmf_tcp_free_pdu(pdu);
587 return (EBADMSG);
588 }
589
590 data_len = le32toh(c2h->datal);
591 if (data_len != pdu->data_len) {
592 nvmf_tcp_report_error(qp->qp.nq_association, qp,
593 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
594 offsetof(struct nvme_tcp_c2h_data_hdr, datal), pdu->hdr,
595 le32toh(pdu->hdr->plen), pdu->hdr->hlen);
596 nvmf_tcp_free_pdu(pdu);
597 return (EBADMSG);
598 }
599
600 data_offset = le32toh(c2h->datao);
601 if (data_offset < cb->data_offset ||
602 data_offset + data_len > cb->data_offset + cb->data_len) {
603 nvmf_tcp_report_error(qp->qp.nq_association, qp,
604 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
605 pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen);
606 nvmf_tcp_free_pdu(pdu);
607 return (EBADMSG);
608 }
609
610 if (data_offset != cb->data_offset + cb->data_xfered) {
611 nvmf_tcp_report_error(qp->qp.nq_association, qp,
612 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
613 le32toh(pdu->hdr->plen), pdu->hdr->hlen);
614 nvmf_tcp_free_pdu(pdu);
615 return (EBADMSG);
616 }
617
618 if ((cb->data_xfered + data_len == cb->data_len) !=
619 ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_LAST_PDU) != 0)) {
620 nvmf_tcp_report_error(qp->qp.nq_association, qp,
621 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
622 le32toh(pdu->hdr->plen), pdu->hdr->hlen);
623 nvmf_tcp_free_pdu(pdu);
624 return (EBADMSG);
625 }
626
627 cb->data_xfered += data_len;
628 data_offset -= cb->data_offset;
629 icd = (const char *)pdu->hdr + pdu->hdr->pdo;
630 memcpy((char *)cb->data + data_offset, icd, data_len);
631
632 if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) {
633 struct nvme_completion cqe;
634 struct nvmf_tcp_capsule *tc;
635 struct nvmf_capsule *nc;
636
637 memset(&cqe, 0, sizeof(cqe));
638 cqe.cid = cb->cid;
639
640 nc = nvmf_allocate_response(&qp->qp, &cqe);
641 if (nc == NULL) {
642 nvmf_tcp_free_pdu(pdu);
643 return (ENOMEM);
644 }
645 nc->nc_sqhd_valid = false;
646
647 tc = TCAP(nc);
648 TAILQ_INSERT_TAIL(&qp->rx_capsules, tc, link);
649 }
650
651 nvmf_tcp_free_pdu(pdu);
652 return (0);
653 }
654
655 /* NB: cid and ttag are little-endian already. */
656 static int
657 tcp_send_h2c_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
658 uint32_t data_offset, void *buf, size_t len, bool last_pdu)
659 {
660 struct nvme_tcp_h2c_data_hdr h2c;
661
662 memset(&h2c, 0, sizeof(h2c));
663 h2c.common.pdu_type = NVME_TCP_PDU_TYPE_H2C_DATA;
664 if (last_pdu)
665 h2c.common.flags |= NVME_TCP_H2C_DATA_FLAGS_LAST_PDU;
666 h2c.cccid = cid;
667 h2c.ttag = ttag;
668 h2c.datao = htole32(data_offset);
669 h2c.datal = htole32(len);
670
671 return (nvmf_tcp_construct_pdu(qp, &h2c, sizeof(h2c), buf, len));
672 }
673
674 /* Sends one or more H2C_DATA PDUs, subject to MAXH2CDATA. */
675 static int
676 tcp_send_h2c_pdus(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
677 uint32_t data_offset, void *buf, size_t len, bool last_pdu)
678 {
679 char *p;
680
681 p = buf;
682 while (len != 0) {
683 size_t todo;
684 int error;
685
686 todo = len;
687 if (todo > qp->maxh2cdata)
688 todo = qp->maxh2cdata;
689 error = tcp_send_h2c_pdu(qp, cid, ttag, data_offset, p, todo,
690 last_pdu && todo == len);
691 if (error != 0)
692 return (error);
693 p += todo;
694 len -= todo;
695 }
696 return (0);
697 }
698
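/*
 * Host side: satisfy a controller's R2T by sending the requested
 * range of the transmit command buffer as one or more H2C_DATA PDUs.
 */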
699 static int
700 nvmf_tcp_handle_r2t(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
701 {
702 struct nvmf_tcp_command_buffer *cb;
703 struct nvme_tcp_r2t_hdr *r2t;
704 uint32_t data_len, data_offset;
705 int error;
706
707 r2t = (void *)pdu->hdr;
708
709 cb = tcp_find_command_buffer(qp, r2t->cccid, 0, false);
710 if (cb == NULL) {
711 nvmf_tcp_report_error(qp->qp.nq_association, qp,
712 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
713 offsetof(struct nvme_tcp_r2t_hdr, cccid), pdu->hdr,
714 le32toh(pdu->hdr->plen), pdu->hdr->hlen);
715 nvmf_tcp_free_pdu(pdu);
716 return (EBADMSG);
717 }
718
719 data_offset = le32toh(r2t->r2to);
720 if (data_offset != cb->data_xfered) {
721 nvmf_tcp_report_error(qp->qp.nq_association, qp,
722 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
723 le32toh(pdu->hdr->plen), pdu->hdr->hlen);
724 nvmf_tcp_free_pdu(pdu);
725 return (EBADMSG);
726 }
727
728 /*
729 * XXX: The spec does not specify how to handle R2T transfers
730 * out of range of the original command.
731 */
732 data_len = le32toh(r2t->r2tl);
733 if (data_offset + data_len > cb->data_len) {
734 nvmf_tcp_report_error(qp->qp.nq_association, qp,
735 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
736 pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen);
737 nvmf_tcp_free_pdu(pdu);
738 return (EBADMSG);
739 }
740
741 cb->data_xfered += data_len;
742
743 /*
744 * Write out one or more H2C_DATA PDUs containing the
745 * requested data.
746 */
747 error = tcp_send_h2c_pdus(qp, r2t->cccid, r2t->ttag,
748 data_offset, (char *)cb->data + data_offset, data_len, true);
749
750 nvmf_tcp_free_pdu(pdu);
751 return (error);
752 }
753
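/* Read one PDU from the socket and dispatch it based on its type. */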
754 static int
755 nvmf_tcp_receive_pdu(struct nvmf_tcp_qpair *qp)
756 {
757 struct nvmf_tcp_rxpdu pdu;
758 int error;
759
760 error = nvmf_tcp_read_pdu(qp, &pdu);
761 if (error != 0)
762 return (error);
763
764 switch (pdu.hdr->pdu_type) {
765 default:
766 __unreachable();
767 break;
768 case NVME_TCP_PDU_TYPE_H2C_TERM_REQ:
769 case NVME_TCP_PDU_TYPE_C2H_TERM_REQ:
770 return (nvmf_tcp_handle_term_req(&pdu));
771 case NVME_TCP_PDU_TYPE_CAPSULE_CMD:
772 return (nvmf_tcp_save_command_capsule(qp, &pdu));
773 case NVME_TCP_PDU_TYPE_CAPSULE_RESP:
774 return (nvmf_tcp_save_response_capsule(qp, &pdu));
775 case NVME_TCP_PDU_TYPE_H2C_DATA:
776 return (nvmf_tcp_handle_h2c_data(qp, &pdu));
777 case NVME_TCP_PDU_TYPE_C2H_DATA:
778 return (nvmf_tcp_handle_c2h_data(qp, &pdu));
779 case NVME_TCP_PDU_TYPE_R2T:
780 return (nvmf_tcp_handle_r2t(qp, &pdu));
781 }
782 }
783
784 static bool
785 nvmf_tcp_validate_ic_pdu(struct nvmf_association *na, struct nvmf_tcp_qpair *qp,
786 const struct nvme_tcp_common_pdu_hdr *ch, size_t pdu_len)
787 {
788 const struct nvme_tcp_ic_req *pdu;
789 uint32_t plen;
790 u_int hlen;
791
792 /* Determine how large of a PDU header to return for errors. */
793 hlen = ch->hlen;
794 plen = le32toh(ch->plen);
795 if (hlen < sizeof(*ch) || hlen > plen)
796 hlen = sizeof(*ch);
797
798 /*
799 * Errors must be reported for the lowest incorrect field
800 * first, so validate fields in order.
801 */
802
803 /* Validate pdu_type. */
804
805 /* Controllers only receive PDUs with a PDU direction of 0. */
806 if (na->na_controller != ((ch->pdu_type & 0x01) == 0)) {
807 na_error(na, "NVMe/TCP: Invalid PDU type %u", ch->pdu_type);
808 nvmf_tcp_report_error(na, qp,
809 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 0, ch, pdu_len,
810 hlen);
811 return (false);
812 }
813
814 switch (ch->pdu_type) {
815 case NVME_TCP_PDU_TYPE_IC_REQ:
816 case NVME_TCP_PDU_TYPE_IC_RESP:
817 break;
818 default:
819 na_error(na, "NVMe/TCP: Invalid PDU type %u", ch->pdu_type);
820 nvmf_tcp_report_error(na, qp,
821 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 0, ch, pdu_len,
822 hlen);
823 return (false);
824 }
825
826 /* Validate flags. */
827 if (ch->flags != 0) {
828 na_error(na, "NVMe/TCP: Invalid PDU header flags %#x",
829 ch->flags);
830 nvmf_tcp_report_error(na, qp,
831 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 1, ch, pdu_len,
832 hlen);
833 return (false);
834 }
835
836 /* Validate hlen. */
837 if (ch->hlen != 128) {
838 na_error(na, "NVMe/TCP: Invalid PDU header length %u",
839 ch->hlen);
840 nvmf_tcp_report_error(na, qp,
841 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 2, ch, pdu_len,
842 hlen);
843 return (false);
844 }
845
846 /* Validate pdo. */
847 if (ch->pdo != 0) {
848 na_error(na, "NVMe/TCP: Invalid PDU data offset %u", ch->pdo);
849 nvmf_tcp_report_error(na, qp,
850 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 3, ch, pdu_len,
851 hlen);
852 return (false);
853 }
854
855 /* Validate plen. */
856 if (plen != 128) {
857 na_error(na, "NVMe/TCP: Invalid PDU length %u", plen);
858 nvmf_tcp_report_error(na, qp,
859 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 4, ch, pdu_len,
860 hlen);
861 return (false);
862 }
863
864 /* Validate fields common to both ICReq and ICResp. */
865 pdu = (const struct nvme_tcp_ic_req *)ch;
866 if (le16toh(pdu->pfv) != 0) {
867 na_error(na, "NVMe/TCP: Unsupported PDU version %u",
868 le16toh(pdu->pfv));
869 nvmf_tcp_report_error(na, qp,
870 NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER,
871 8, ch, pdu_len, hlen);
872 return (false);
873 }
874
875 if (pdu->hpda > NVME_TCP_HPDA_MAX) {
876 na_error(na, "NVMe/TCP: Unsupported PDA %u", pdu->hpda);
877 nvmf_tcp_report_error(na, qp,
878 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 10, ch, pdu_len,
879 hlen);
880 return (false);
881 }
882
883 if (pdu->dgst.bits.reserved != 0) {
884 na_error(na, "NVMe/TCP: Invalid digest settings");
885 nvmf_tcp_report_error(na, qp,
886 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 11, ch, pdu_len,
887 hlen);
888 return (false);
889 }
890
891 return (true);
892 }
893
894 static bool
895 nvmf_tcp_read_ic_req(struct nvmf_association *na, struct nvmf_tcp_qpair *qp,
896 struct nvme_tcp_ic_req *pdu)
897 {
898 int error;
899
900 error = nvmf_tcp_read_buffer(qp->s, pdu, sizeof(*pdu));
901 if (error != 0) {
902 na_error(na, "NVMe/TCP: Failed to read IC request: %s",
903 strerror(error));
904 return (false);
905 }
906
907 return (nvmf_tcp_validate_ic_pdu(na, qp, &pdu->common, sizeof(*pdu)));
908 }
909
910 static bool
911 nvmf_tcp_read_ic_resp(struct nvmf_association *na, struct nvmf_tcp_qpair *qp,
912 struct nvme_tcp_ic_resp *pdu)
913 {
914 int error;
915
916 error = nvmf_tcp_read_buffer(qp->s, pdu, sizeof(*pdu));
917 if (error != 0) {
918 na_error(na, "NVMe/TCP: Failed to read IC response: %s",
919 strerror(error));
920 return (false);
921 }
922
923 return (nvmf_tcp_validate_ic_pdu(na, qp, &pdu->common, sizeof(*pdu)));
924 }
925
926 static struct nvmf_association *
927 tcp_allocate_association(bool controller,
928 const struct nvmf_association_params *params)
929 {
930 struct nvmf_tcp_association *ta;
931
932 if (controller) {
933 /* 7.4.10.3 */
934 if (params->tcp.maxh2cdata < 4096 ||
935 params->tcp.maxh2cdata % 4 != 0)
936 return (NULL);
937 }
938
939 ta = calloc(1, sizeof(*ta));
940
941 return (&ta->na);
942 }
943
944 static void
945 tcp_update_association(struct nvmf_association *na,
946 const struct nvme_controller_data *cdata)
947 {
948 struct nvmf_tcp_association *ta = TASSOC(na);
949
950 ta->ioccsz = le32toh(cdata->ioccsz);
951 }
952
953 static void
954 tcp_free_association(struct nvmf_association *na)
955 {
956 free(na);
957 }
958
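/*
 * Host side of connection setup: send an ICReq, validate the ICResp,
 * and record the negotiated digests, PDA, MAXH2CDATA, and in-capsule
 * data limit in the queue pair.
 */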
959 static bool
960 tcp_connect(struct nvmf_tcp_qpair *qp, struct nvmf_association *na, bool admin)
961 {
962 const struct nvmf_association_params *params = &na->na_params;
963 struct nvmf_tcp_association *ta = TASSOC(na);
964 struct nvme_tcp_ic_req ic_req;
965 struct nvme_tcp_ic_resp ic_resp;
966 uint32_t maxh2cdata;
967 int error;
968
969 if (!admin) {
970 if (ta->ioccsz == 0) {
971 na_error(na, "TCP I/O queues require cdata");
972 return (false);
973 }
974 if (ta->ioccsz < 4) {
975 na_error(na, "Invalid IOCCSZ %u", ta->ioccsz);
976 return (false);
977 }
978 }
979
980 memset(&ic_req, 0, sizeof(ic_req));
981 ic_req.common.pdu_type = NVME_TCP_PDU_TYPE_IC_REQ;
982 ic_req.common.hlen = sizeof(ic_req);
983 ic_req.common.plen = htole32(sizeof(ic_req));
984 ic_req.pfv = htole16(0);
985 ic_req.hpda = params->tcp.pda;
986 if (params->tcp.header_digests)
987 ic_req.dgst.bits.hdgst_enable = 1;
988 if (params->tcp.data_digests)
989 ic_req.dgst.bits.ddgst_enable = 1;
990 ic_req.maxr2t = htole32(params->tcp.maxr2t);
991
992 error = nvmf_tcp_write_pdu(qp, &ic_req, sizeof(ic_req));
993 if (error != 0) {
994 na_error(na, "Failed to write IC request: %s", strerror(error));
995 return (false);
996 }
997
998 if (!nvmf_tcp_read_ic_resp(na, qp, &ic_resp))
999 return (false);
1000
1001 /* Ensure the controller didn't enable digests we didn't request. */
1002 if ((!params->tcp.header_digests &&
1003 ic_resp.dgst.bits.hdgst_enable != 0) ||
1004 (!params->tcp.data_digests &&
1005 ic_resp.dgst.bits.ddgst_enable != 0)) {
1006 na_error(na, "Controller enabled unrequested digests");
1007 nvmf_tcp_report_error(na, qp,
1008 NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER,
1009 11, &ic_resp, sizeof(ic_resp), sizeof(ic_resp));
1010 return (false);
1011 }
1012
1013 /*
1014 * XXX: Is there an upper-bound to enforce here? Perhaps pick
1015 * some large value and report larger values as an unsupported
1016 * parameter?
1017 */
1018 maxh2cdata = le32toh(ic_resp.maxh2cdata);
1019 if (maxh2cdata < 4096 || maxh2cdata % 4 != 0) {
1020 na_error(na, "Invalid MAXH2CDATA %u", maxh2cdata);
1021 nvmf_tcp_report_error(na, qp,
1022 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 12, &ic_resp,
1023 sizeof(ic_resp), sizeof(ic_resp));
1024 return (false);
1025 }
1026
1027 qp->rxpda = (params->tcp.pda + 1) * 4;
1028 qp->txpda = (ic_resp.cpda + 1) * 4;
1029 qp->header_digests = ic_resp.dgst.bits.hdgst_enable != 0;
1030 qp->data_digests = ic_resp.dgst.bits.ddgst_enable != 0;
1031 qp->maxr2t = params->tcp.maxr2t;
1032 qp->maxh2cdata = maxh2cdata;
1033 if (admin)
1034 /* 7.4.3 */
1035 qp->max_icd = 8192;
1036 else
1037 qp->max_icd = (ta->ioccsz - 4) * 16;
1038
1039 return (true);
1040 }
1041
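/*
 * Controller side of connection setup: validate the host's ICReq and
 * answer with an ICResp reflecting the association parameters.
 */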
1042 static bool
1043 tcp_accept(struct nvmf_tcp_qpair *qp, struct nvmf_association *na)
1044 {
1045 const struct nvmf_association_params *params = &na->na_params;
1046 struct nvme_tcp_ic_req ic_req;
1047 struct nvme_tcp_ic_resp ic_resp;
1048 int error;
1049
1050 if (!nvmf_tcp_read_ic_req(na, qp, &ic_req))
1051 return (false);
1052
1053 memset(&ic_resp, 0, sizeof(ic_resp));
1054 ic_resp.common.pdu_type = NVME_TCP_PDU_TYPE_IC_RESP;
1055 ic_resp.common.hlen = sizeof(ic_resp);
1056 ic_resp.common.plen = htole32(sizeof(ic_resp));
1057 ic_resp.pfv = htole16(0);
1058 ic_resp.cpda = params->tcp.pda;
1059 if (params->tcp.header_digests && ic_req.dgst.bits.hdgst_enable != 0)
1060 ic_resp.dgst.bits.hdgst_enable = 1;
1061 if (params->tcp.data_digests && ic_req.dgst.bits.ddgst_enable != 0)
1062 ic_resp.dgst.bits.ddgst_enable = 1;
1063 ic_resp.maxh2cdata = htole32(params->tcp.maxh2cdata);
1064
1065 error = nvmf_tcp_write_pdu(qp, &ic_resp, sizeof(ic_resp));
1066 if (error != 0) {
1067 na_error(na, "Failed to write IC response: %s",
1068 strerror(error));
1069 return (false);
1070 }
1071
1072 qp->rxpda = (params->tcp.pda + 1) * 4;
1073 qp->txpda = (ic_req.hpda + 1) * 4;
1074 qp->header_digests = ic_resp.dgst.bits.hdgst_enable != 0;
1075 qp->data_digests = ic_resp.dgst.bits.ddgst_enable != 0;
1076 qp->maxr2t = le32toh(ic_req.maxr2t);
1077 qp->maxh2cdata = params->tcp.maxh2cdata;
1078 qp->max_icd = 0; /* XXX */
1079 return (true);
1080 }
1081
1082 static struct nvmf_qpair *
1083 tcp_allocate_qpair(struct nvmf_association *na,
1084 const struct nvmf_qpair_params *qparams)
1085 {
1086 const struct nvmf_association_params *aparams = &na->na_params;
1087 struct nvmf_tcp_qpair *qp;
1088 bool connected;
1089
1090 if (aparams->tcp.pda > NVME_TCP_CPDA_MAX) {
1091 na_error(na, "Invalid PDA");
1092 return (NULL);
1093 }
1094
1095 qp = calloc(1, sizeof(*qp));
1096 qp->s = qparams->tcp.fd;
1097 LIST_INIT(&qp->rx_buffers);
1098 LIST_INIT(&qp->tx_buffers);
1099 TAILQ_INIT(&qp->rx_capsules);
1100 if (na->na_controller)
1101 connected = tcp_accept(qp, na);
1102 else
1103 connected = tcp_connect(qp, na, qparams->admin);
1104 if (!connected) {
1105 free(qp);
1106 return (NULL);
1107 }
1108
1109 return (&qp->qp);
1110 }
1111
1112 static void
1113 tcp_free_qpair(struct nvmf_qpair *nq)
1114 {
1115 struct nvmf_tcp_qpair *qp = TQP(nq);
1116 struct nvmf_tcp_capsule *ntc, *tc;
1117 struct nvmf_tcp_command_buffer *ncb, *cb;
1118
1119 TAILQ_FOREACH_SAFE(tc, &qp->rx_capsules, link, ntc) {
1120 TAILQ_REMOVE(&qp->rx_capsules, tc, link);
1121 nvmf_free_capsule(&tc->nc);
1122 }
1123 LIST_FOREACH_SAFE(cb, &qp->rx_buffers, link, ncb) {
1124 tcp_free_command_buffer(cb);
1125 }
1126 LIST_FOREACH_SAFE(cb, &qp->tx_buffers, link, ncb) {
1127 tcp_free_command_buffer(cb);
1128 }
1129 free(qp);
1130 }
1131
1132 static int
1133 tcp_kernel_handoff_params(struct nvmf_qpair *nq,
1134 struct nvmf_handoff_qpair_params *qparams)
1135 {
1136 struct nvmf_tcp_qpair *qp = TQP(nq);
1137
1138 qparams->tcp.fd = qp->s;
1139 qparams->tcp.rxpda = qp->rxpda;
1140 qparams->tcp.txpda = qp->txpda;
1141 qparams->tcp.header_digests = qp->header_digests;
1142 qparams->tcp.data_digests = qp->data_digests;
1143 qparams->tcp.maxr2t = qp->maxr2t;
1144 qparams->tcp.maxh2cdata = qp->maxh2cdata;
1145 qparams->tcp.max_icd = qp->max_icd;
1146
1147 return (0);
1148 }
1149
1150 static struct nvmf_capsule *
1151 tcp_allocate_capsule(struct nvmf_qpair *qp __unused)
1152 {
1153 struct nvmf_tcp_capsule *nc;
1154
1155 nc = calloc(1, sizeof(*nc));
1156 return (&nc->nc);
1157 }
1158
1159 static void
1160 tcp_free_capsule(struct nvmf_capsule *nc)
1161 {
1162 struct nvmf_tcp_capsule *tc = TCAP(nc);
1163
1164 nvmf_tcp_free_pdu(&tc->rx_pdu);
1165 if (tc->cb != NULL)
1166 tcp_free_command_buffer(tc->cb);
1167 free(tc);
1168 }
1169
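/*
 * Transmit a command capsule.  Payloads that fit within the
 * negotiated in-capsule data limit are sent inline; otherwise a
 * command buffer is queued so the data can be transferred later via
 * R2T/H2C_DATA (host writes) or C2H_DATA (host reads).
 */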
1170 static int
1171 tcp_transmit_command(struct nvmf_capsule *nc)
1172 {
1173 struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
1174 struct nvmf_tcp_capsule *tc = TCAP(nc);
1175 struct nvme_tcp_cmd cmd;
1176 struct nvme_sgl_descriptor *sgl;
1177 int error;
1178 bool use_icd;
1179
1180 use_icd = false;
1181 if (nc->nc_data_len != 0 && nc->nc_send_data &&
1182 nc->nc_data_len <= qp->max_icd)
1183 use_icd = true;
1184
1185 memset(&cmd, 0, sizeof(cmd));
1186 cmd.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_CMD;
1187 cmd.ccsqe = nc->nc_sqe;
1188
1189 /* Populate SGL in SQE. */
1190 sgl = &cmd.ccsqe.sgl;
1191 memset(sgl, 0, sizeof(*sgl));
1192 sgl->address = 0;
1193 sgl->length = htole32(nc->nc_data_len);
1194 if (use_icd) {
1195 /* Use in-capsule data. */
1196 sgl->type = NVME_SGL_TYPE_ICD;
1197 } else {
1198 /* Use a command buffer. */
1199 sgl->type = NVME_SGL_TYPE_COMMAND_BUFFER;
1200 }
1201
1202 /* Send command capsule. */
1203 error = nvmf_tcp_construct_pdu(qp, &cmd, sizeof(cmd), use_icd ?
1204 nc->nc_data : NULL, use_icd ? nc->nc_data_len : 0);
1205 if (error != 0)
1206 return (error);
1207
1208 /*
1209 * If data will be transferred using a command buffer, allocate a
1210 * buffer structure and queue it.
1211 */
1212 if (nc->nc_data_len != 0 && !use_icd)
1213 tc->cb = tcp_alloc_command_buffer(qp, nc->nc_data, 0,
1214 nc->nc_data_len, cmd.ccsqe.cid, 0, !nc->nc_send_data);
1215
1216 return (0);
1217 }
1218
1219 static int
1220 tcp_transmit_response(struct nvmf_capsule *nc)
1221 {
1222 struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
1223 struct nvme_tcp_rsp rsp;
1224
1225 memset(&rsp, 0, sizeof(rsp));
1226 rsp.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_RESP;
1227 rsp.rccqe = nc->nc_cqe;
1228
1229 return (nvmf_tcp_construct_pdu(qp, &rsp, sizeof(rsp), NULL, 0));
1230 }
1231
1232 static int
1233 tcp_transmit_capsule(struct nvmf_capsule *nc)
1234 {
1235 if (nc->nc_qe_len == sizeof(struct nvme_command))
1236 return (tcp_transmit_command(nc));
1237 else
1238 return (tcp_transmit_response(nc));
1239 }
1240
1241 static int
1242 tcp_receive_capsule(struct nvmf_qpair *nq, struct nvmf_capsule **ncp)
1243 {
1244 struct nvmf_tcp_qpair *qp = TQP(nq);
1245 struct nvmf_tcp_capsule *tc;
1246 int error;
1247
1248 while (TAILQ_EMPTY(&qp->rx_capsules)) {
1249 error = nvmf_tcp_receive_pdu(qp);
1250 if (error != 0)
1251 return (error);
1252 }
1253 tc = TAILQ_FIRST(&qp->rx_capsules);
1254 TAILQ_REMOVE(&qp->rx_capsules, tc, link);
1255 *ncp = &tc->nc;
1256 return (0);
1257 }
1258
1259 static uint8_t
1260 tcp_validate_command_capsule(const struct nvmf_capsule *nc)
1261 {
1262 const struct nvmf_tcp_capsule *tc = CTCAP(nc);
1263 const struct nvme_sgl_descriptor *sgl;
1264
1265 assert(tc->rx_pdu.hdr != NULL);
1266
1267 sgl = &nc->nc_sqe.sgl;
1268 switch (sgl->type) {
1269 case NVME_SGL_TYPE_ICD:
1270 if (tc->rx_pdu.data_len != le32toh(sgl->length)) {
1271 printf("NVMe/TCP: Command Capsule with mismatched ICD length\n");
1272 return (NVME_SC_DATA_SGL_LENGTH_INVALID);
1273 }
1274 break;
1275 case NVME_SGL_TYPE_COMMAND_BUFFER:
1276 if (tc->rx_pdu.data_len != 0) {
1277 printf("NVMe/TCP: Command Buffer SGL with ICD\n");
1278 return (NVME_SC_INVALID_FIELD);
1279 }
1280 break;
1281 default:
1282 printf("NVMe/TCP: Invalid SGL type in Command Capsule\n");
1283 return (NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID);
1284 }
1285
1286 if (sgl->address != 0) {
1287 printf("NVMe/TCP: Invalid SGL offset in Command Capsule\n");
1288 return (NVME_SC_SGL_OFFSET_INVALID);
1289 }
1290
1291 return (NVME_SC_SUCCESS);
1292 }
1293
1294 static size_t
1295 tcp_capsule_data_len(const struct nvmf_capsule *nc)
1296 {
1297 assert(nc->nc_qe_len == sizeof(struct nvme_command));
1298 return (le32toh(nc->nc_sqe.sgl.length));
1299 }
1300
1301 /* NB: cid and ttag are both little-endian already. */
1302 static int
1303 tcp_send_r2t(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
1304 uint32_t data_offset, uint32_t data_len)
1305 {
1306 struct nvme_tcp_r2t_hdr r2t;
1307
1308 memset(&r2t, 0, sizeof(r2t));
1309 r2t.common.pdu_type = NVME_TCP_PDU_TYPE_R2T;
1310 r2t.cccid = cid;
1311 r2t.ttag = ttag;
1312 r2t.r2to = htole32(data_offset);
1313 r2t.r2tl = htole32(data_len);
1314
1315 return (nvmf_tcp_construct_pdu(qp, &r2t, sizeof(r2t), NULL, 0));
1316 }
1317
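/*
 * Controller side: pull host data for a command by issuing an R2T and
 * then processing PDUs until the associated command buffer has been
 * filled by H2C_DATA PDUs.
 */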
1318 static int
1319 tcp_receive_r2t_data(const struct nvmf_capsule *nc, uint32_t data_offset,
1320 void *buf, size_t len)
1321 {
1322 struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
1323 struct nvmf_tcp_command_buffer *cb;
1324 int error;
1325 uint16_t ttag;
1326
1327 /*
1328 * Don't bother byte-swapping ttag as it is just a cookie
1329 * value returned by the other end as-is.
1330 */
1331 ttag = qp->next_ttag++;
1332
1333 error = tcp_send_r2t(qp, nc->nc_sqe.cid, ttag, data_offset, len);
1334 if (error != 0)
1335 return (error);
1336
1337 cb = tcp_alloc_command_buffer(qp, buf, data_offset, len,
1338 nc->nc_sqe.cid, ttag, true);
1339
1340 /* Parse received PDUs until the data transfer is complete. */
1341 while (cb->data_xfered < cb->data_len) {
1342 error = nvmf_tcp_receive_pdu(qp);
1343 if (error != 0)
1344 break;
1345 }
1346 tcp_free_command_buffer(cb);
1347 return (error);
1348 }
1349
1350 static int
1351 tcp_receive_icd_data(const struct nvmf_capsule *nc, uint32_t data_offset,
1352 void *buf, size_t len)
1353 {
1354 const struct nvmf_tcp_capsule *tc = CTCAP(nc);
1355 const char *icd;
1356
1357 icd = (const char *)tc->rx_pdu.hdr + tc->rx_pdu.hdr->pdo + data_offset;
1358 memcpy(buf, icd, len);
1359 return (0);
1360 }
1361
1362 static int
1363 tcp_receive_controller_data(const struct nvmf_capsule *nc, uint32_t data_offset,
1364 void *buf, size_t len)
1365 {
1366 struct nvmf_association *na = nc->nc_qpair->nq_association;
1367 const struct nvme_sgl_descriptor *sgl;
1368 size_t data_len;
1369
1370 if (nc->nc_qe_len != sizeof(struct nvme_command) || !na->na_controller)
1371 return (EINVAL);
1372
1373 sgl = &nc->nc_sqe.sgl;
1374 data_len = le32toh(sgl->length);
1375 if (data_offset + len > data_len)
1376 return (EFBIG);
1377
1378 if (sgl->type == NVME_SGL_TYPE_ICD)
1379 return (tcp_receive_icd_data(nc, data_offset, buf, len));
1380 else
1381 return (tcp_receive_r2t_data(nc, data_offset, buf, len));
1382 }
1383
1384 /* NB: cid is little-endian already. */
1385 static int
1386 tcp_send_c2h_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid,
1387 uint32_t data_offset, const void *buf, size_t len, bool last_pdu,
1388 bool success)
1389 {
1390 struct nvme_tcp_c2h_data_hdr c2h;
1391
1392 memset(&c2h, 0, sizeof(c2h));
1393 c2h.common.pdu_type = NVME_TCP_PDU_TYPE_C2H_DATA;
1394 if (last_pdu)
1395 c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_LAST_PDU;
1396 if (success)
1397 c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_SUCCESS;
1398 c2h.cccid = cid;
1399 c2h.datao = htole32(data_offset);
1400 c2h.datal = htole32(len);
1401
1402 return (nvmf_tcp_construct_pdu(qp, &c2h, sizeof(c2h),
1403 __DECONST(void *, buf), len));
1404 }
1405
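/*
 * Controller side: return read data to the host as one or more
 * C2H_DATA PDUs.  The SUCCESS flag on the final PDU replaces a
 * separate response capsule when SQ flow control is disabled;
 * otherwise an explicit success completion is sent afterwards.
 */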
1406 static int
1407 tcp_send_controller_data(const struct nvmf_capsule *nc, const void *buf,
1408 size_t len)
1409 {
1410 struct nvmf_association *na = nc->nc_qpair->nq_association;
1411 struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
1412 const struct nvme_sgl_descriptor *sgl;
1413 const char *src;
1414 size_t todo;
1415 uint32_t data_len, data_offset;
1416 int error;
1417 bool last_pdu, send_success_flag;
1418
1419 if (nc->nc_qe_len != sizeof(struct nvme_command) || !na->na_controller)
1420 return (EINVAL);
1421
1422 sgl = &nc->nc_sqe.sgl;
1423 data_len = le32toh(sgl->length);
1424 if (len != data_len) {
1425 nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD);
1426 return (EFBIG);
1427 }
1428
1429 if (sgl->type != NVME_SGL_TYPE_COMMAND_BUFFER) {
1430 nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD);
1431 return (EINVAL);
1432 }
1433
1434 /* Use the SUCCESS flag if SQ flow control is disabled. */
1435 send_success_flag = !qp->qp.nq_flow_control;
1436
1437 /*
1438 * Write out one or more C2H_DATA PDUs containing the data.
1439 * Each PDU is arbitrarily capped at 256k.
1440 */
1441 data_offset = 0;
1442 src = buf;
1443 while (len > 0) {
1444 if (len > 256 * 1024) {
1445 todo = 256 * 1024;
1446 last_pdu = false;
1447 } else {
1448 todo = len;
1449 last_pdu = true;
1450 }
1451 error = tcp_send_c2h_pdu(qp, nc->nc_sqe.cid, data_offset,
1452 src, todo, last_pdu, last_pdu && send_success_flag);
1453 if (error != 0) {
1454 nvmf_send_generic_error(nc,
1455 NVME_SC_TRANSIENT_TRANSPORT_ERROR);
1456 return (error);
1457 }
1458 data_offset += todo;
1459 src += todo;
1460 len -= todo;
1461 }
1462 if (!send_success_flag)
1463 nvmf_send_success(nc);
1464 return (0);
1465 }
1466
1467 struct nvmf_transport_ops tcp_ops = {
1468 .allocate_association = tcp_allocate_association,
1469 .update_association = tcp_update_association,
1470 .free_association = tcp_free_association,
1471 .allocate_qpair = tcp_allocate_qpair,
1472 .free_qpair = tcp_free_qpair,
1473 .kernel_handoff_params = tcp_kernel_handoff_params,
1474 .allocate_capsule = tcp_allocate_capsule,
1475 .free_capsule = tcp_free_capsule,
1476 .transmit_capsule = tcp_transmit_capsule,
1477 .receive_capsule = tcp_receive_capsule,
1478 .validate_command_capsule = tcp_validate_command_capsule,
1479 .capsule_data_len = tcp_capsule_data_len,
1480 .receive_controller_data = tcp_receive_controller_data,
1481 .send_controller_data = tcp_send_controller_data,
1482 };
1483