1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2022-2024 Chelsio Communications, Inc.
5 * Written by: John Baldwin <jhb@FreeBSD.org>
6 */
7
8 #include <sys/endian.h>
9 #include <sys/gsb_crc32.h>
10 #include <sys/queue.h>
11 #include <sys/socket.h>
12 #include <sys/uio.h>
13 #include <assert.h>
14 #include <errno.h>
15 #include <netdb.h>
16 #include <stdio.h>
17 #include <stdlib.h>
18 #include <string.h>
19 #include <unistd.h>
20
21 #include "libnvmf.h"
22 #include "internal.h"
23 #include "nvmf_tcp.h"
24
25 struct nvmf_tcp_qpair;
26
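/*
 * A command buffer tracks an in-progress data transfer for a single
 * command, keyed by CID and, for R2T-driven transfers, the transfer tag.
 */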
27 struct nvmf_tcp_command_buffer {
28 struct nvmf_tcp_qpair *qp;
29
30 void *data;
31 size_t data_len;
32 size_t data_xfered;
33 uint32_t data_offset;
34
35 uint16_t cid;
36 uint16_t ttag;
37
38 LIST_ENTRY(nvmf_tcp_command_buffer) link;
39 };
40
41 LIST_HEAD(nvmf_tcp_command_buffer_list, nvmf_tcp_command_buffer);
42
43 struct nvmf_tcp_association {
44 struct nvmf_association na;
45
46 uint32_t ioccsz;
47 };
48
49 struct nvmf_tcp_rxpdu {
50 struct nvme_tcp_common_pdu_hdr *hdr;
51 uint32_t data_len;
52 };
53
54 struct nvmf_tcp_capsule {
55 struct nvmf_capsule nc;
56
57 struct nvmf_tcp_rxpdu rx_pdu;
58 struct nvmf_tcp_command_buffer *cb;
59
60 TAILQ_ENTRY(nvmf_tcp_capsule) link;
61 };
62
63 struct nvmf_tcp_qpair {
64 struct nvmf_qpair qp;
65 int s;
66
67 uint8_t txpda;
68 uint8_t rxpda;
69 bool header_digests;
70 bool data_digests;
71 uint32_t maxr2t;
72 uint32_t maxh2cdata;
73 uint32_t max_icd; /* Host only */
74 uint16_t next_ttag; /* Controller only */
75
76 struct nvmf_tcp_command_buffer_list tx_buffers;
77 struct nvmf_tcp_command_buffer_list rx_buffers;
78 TAILQ_HEAD(, nvmf_tcp_capsule) rx_capsules;
79 };
80
#define TASSOC(na) ((struct nvmf_tcp_association *)(na))
82 #define TCAP(nc) ((struct nvmf_tcp_capsule *)(nc))
83 #define CTCAP(nc) ((const struct nvmf_tcp_capsule *)(nc))
84 #define TQP(qp) ((struct nvmf_tcp_qpair *)(qp))
85
86 static const char zero_padding[NVME_TCP_PDU_PDO_MAX_OFFSET];
87
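/* Header and data digests are CRC32C with the usual initial and final bit inversions. */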
88 static uint32_t
compute_digest(const void *buf, size_t len)
90 {
91 return (calculate_crc32c(0xffffffff, buf, len) ^ 0xffffffff);
92 }
93
94 static struct nvmf_tcp_command_buffer *
tcp_alloc_command_buffer(struct nvmf_tcp_qpair *qp, void *data,
    uint32_t data_offset, size_t data_len, uint16_t cid, uint16_t ttag,
    bool receive)
98 {
99 struct nvmf_tcp_command_buffer *cb;
100
101 cb = malloc(sizeof(*cb));
102 cb->qp = qp;
103 cb->data = data;
104 cb->data_offset = data_offset;
105 cb->data_len = data_len;
106 cb->data_xfered = 0;
107 cb->cid = cid;
108 cb->ttag = ttag;
109
110 if (receive)
111 LIST_INSERT_HEAD(&qp->rx_buffers, cb, link);
112 else
113 LIST_INSERT_HEAD(&qp->tx_buffers, cb, link);
114 return (cb);
115 }
116
117 static struct nvmf_tcp_command_buffer *
tcp_find_command_buffer(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
    bool receive)
120 {
121 struct nvmf_tcp_command_buffer_list *list;
122 struct nvmf_tcp_command_buffer *cb;
123
124 list = receive ? &qp->rx_buffers : &qp->tx_buffers;
125 LIST_FOREACH(cb, list, link) {
126 if (cb->cid == cid && cb->ttag == ttag)
127 return (cb);
128 }
129 return (NULL);
130 }
131
132 static void
tcp_purge_command_buffer(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
    bool receive)
135 {
136 struct nvmf_tcp_command_buffer *cb;
137
138 cb = tcp_find_command_buffer(qp, cid, ttag, receive);
139 if (cb != NULL)
140 LIST_REMOVE(cb, link);
141 }
142
143 static void
tcp_free_command_buffer(struct nvmf_tcp_command_buffer *cb)
145 {
146 LIST_REMOVE(cb, link);
147 free(cb);
148 }
149
150 static int
nvmf_tcp_write_pdu(struct nvmf_tcp_qpair *qp, const void *pdu, size_t len)
152 {
153 ssize_t nwritten;
154 const char *cp;
155
156 cp = pdu;
157 while (len != 0) {
158 nwritten = write(qp->s, cp, len);
159 if (nwritten < 0)
160 return (errno);
161 len -= nwritten;
162 cp += nwritten;
163 }
164 return (0);
165 }
166
167 static int
nvmf_tcp_write_pdu_iov(struct nvmf_tcp_qpair *qp, struct iovec *iov,
    u_int iovcnt, size_t len)
170 {
171 ssize_t nwritten;
172
173 for (;;) {
174 nwritten = writev(qp->s, iov, iovcnt);
175 if (nwritten < 0)
176 return (errno);
177
178 len -= nwritten;
179 if (len == 0)
180 return (0);
181
182 while (iov->iov_len <= (size_t)nwritten) {
183 nwritten -= iov->iov_len;
184 iovcnt--;
185 iov++;
186 }
187
188 iov->iov_base = (char *)iov->iov_base + nwritten;
189 iov->iov_len -= nwritten;
190 }
191 }
192
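/*
 * Report a fatal protocol error to the peer by sending a termination
 * request PDU containing up to NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE
 * bytes of the offending PDU header, then close the connection.
 */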
193 static void
nvmf_tcp_report_error(struct nvmf_association *na, struct nvmf_tcp_qpair *qp,
    uint16_t fes, uint32_t fei, const void *rx_pdu, size_t pdu_len, u_int hlen)
196 {
197 struct nvme_tcp_term_req_hdr hdr;
198 struct iovec iov[2];
199
200 if (hlen != 0) {
201 if (hlen > NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE)
202 hlen = NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE;
203 if (hlen > pdu_len)
204 hlen = pdu_len;
205 }
206
207 memset(&hdr, 0, sizeof(hdr));
208 hdr.common.pdu_type = na->na_controller ?
209 NVME_TCP_PDU_TYPE_C2H_TERM_REQ : NVME_TCP_PDU_TYPE_H2C_TERM_REQ;
210 hdr.common.hlen = sizeof(hdr);
211 hdr.common.plen = sizeof(hdr) + hlen;
212 hdr.fes = htole16(fes);
213 le32enc(hdr.fei, fei);
214 iov[0].iov_base = &hdr;
215 iov[0].iov_len = sizeof(hdr);
216 iov[1].iov_base = __DECONST(void *, rx_pdu);
217 iov[1].iov_len = hlen;
218
219 (void)nvmf_tcp_write_pdu_iov(qp, iov, nitems(iov), sizeof(hdr) + hlen);
220 close(qp->s);
221 qp->s = -1;
222 }
223
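/*
 * Validate the common header of a received PDU and verify its header
 * and data digests if present.
 */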
224 static int
nvmf_tcp_validate_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu,
    size_t pdu_len)
227 {
228 const struct nvme_tcp_common_pdu_hdr *ch;
229 uint32_t data_len, fei, plen;
230 uint32_t digest, rx_digest;
231 u_int hlen;
232 int error;
233 uint16_t fes;
234
235 /* Determine how large of a PDU header to return for errors. */
236 ch = pdu->hdr;
237 hlen = ch->hlen;
238 plen = le32toh(ch->plen);
239 if (hlen < sizeof(*ch) || hlen > plen)
240 hlen = sizeof(*ch);
241
242 error = nvmf_tcp_validate_pdu_header(ch,
243 qp->qp.nq_association->na_controller, qp->header_digests,
244 qp->data_digests, qp->rxpda, &data_len, &fes, &fei);
245 if (error != 0) {
246 if (error == ECONNRESET) {
247 close(qp->s);
248 qp->s = -1;
249 } else {
250 nvmf_tcp_report_error(qp->qp.nq_association, qp,
251 fes, fei, ch, pdu_len, hlen);
252 }
253 return (error);
254 }
255
256 /* Check header digest if present. */
257 if ((ch->flags & NVME_TCP_CH_FLAGS_HDGSTF) != 0) {
258 digest = compute_digest(ch, ch->hlen);
259 memcpy(&rx_digest, (const char *)ch + ch->hlen,
260 sizeof(rx_digest));
261 if (digest != rx_digest) {
262 printf("NVMe/TCP: Header digest mismatch\n");
263 nvmf_tcp_report_error(qp->qp.nq_association, qp,
264 NVME_TCP_TERM_REQ_FES_HDGST_ERROR, rx_digest, ch,
265 pdu_len, hlen);
266 return (EBADMSG);
267 }
268 }
269
270 /* Check data digest if present. */
271 if ((ch->flags & NVME_TCP_CH_FLAGS_DDGSTF) != 0) {
272 digest = compute_digest((const char *)ch + ch->pdo, data_len);
273 memcpy(&rx_digest, (const char *)ch + plen - sizeof(rx_digest),
274 sizeof(rx_digest));
275 if (digest != rx_digest) {
276 printf("NVMe/TCP: Data digest mismatch\n");
277 return (EBADMSG);
278 }
279 }
280
281 pdu->data_len = data_len;
282 return (0);
283 }
284
285 /*
286 * Read data from a socket, retrying until the data has been fully
287 * read or an error occurs.
288 */
289 static int
nvmf_tcp_read_buffer(int s, void *buf, size_t len)
291 {
292 ssize_t nread;
293 char *cp;
294
295 cp = buf;
296 while (len != 0) {
297 nread = read(s, cp, len);
298 if (nread < 0)
299 return (errno);
300 if (nread == 0)
301 return (ECONNRESET);
302 len -= nread;
303 cp += nread;
304 }
305 return (0);
306 }
307
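/*
 * Read a single PDU from the socket into a newly-allocated buffer and
 * validate it.
 */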
308 static int
nvmf_tcp_read_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
310 {
311 struct nvme_tcp_common_pdu_hdr ch;
312 uint32_t plen;
313 int error;
314
315 memset(pdu, 0, sizeof(*pdu));
316 error = nvmf_tcp_read_buffer(qp->s, &ch, sizeof(ch));
317 if (error != 0)
318 return (error);
319
320 plen = le32toh(ch.plen);
321
322 /*
323 * Validate a header with garbage lengths to trigger
324 * an error message without reading more.
325 */
326 if (plen < sizeof(ch) || ch.hlen > plen) {
327 pdu->hdr = &ch;
328 error = nvmf_tcp_validate_pdu(qp, pdu, sizeof(ch));
329 pdu->hdr = NULL;
330 assert(error != 0);
331 return (error);
332 }
333
334 /* Read the rest of the PDU. */
335 pdu->hdr = malloc(plen);
336 memcpy(pdu->hdr, &ch, sizeof(ch));
337 error = nvmf_tcp_read_buffer(qp->s, pdu->hdr + 1, plen - sizeof(ch));
338 if (error != 0)
339 return (error);
340 error = nvmf_tcp_validate_pdu(qp, pdu, plen);
341 if (error != 0) {
342 free(pdu->hdr);
343 pdu->hdr = NULL;
344 }
345 return (error);
346 }
347
348 static void
nvmf_tcp_free_pdu(struct nvmf_tcp_rxpdu *pdu)
350 {
351 free(pdu->hdr);
352 pdu->hdr = NULL;
353 }
354
355 static int
nvmf_tcp_handle_term_req(struct nvmf_tcp_rxpdu *pdu)
357 {
358 struct nvme_tcp_term_req_hdr *hdr;
359
360 hdr = (void *)pdu->hdr;
361
362 printf("NVMe/TCP: Received termination request: fes %#x fei %#x\n",
363 le16toh(hdr->fes), le32dec(hdr->fei));
364 nvmf_tcp_free_pdu(pdu);
365 return (ECONNRESET);
366 }
367
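/*
 * Queue a received command capsule, keeping its PDU around since it may
 * contain in-capsule data.
 */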
368 static int
nvmf_tcp_save_command_capsule(struct nvmf_tcp_qpair *qp,
    struct nvmf_tcp_rxpdu *pdu)
371 {
372 struct nvme_tcp_cmd *cmd;
373 struct nvmf_capsule *nc;
374 struct nvmf_tcp_capsule *tc;
375
376 cmd = (void *)pdu->hdr;
377
378 nc = nvmf_allocate_command(&qp->qp, &cmd->ccsqe);
379 if (nc == NULL)
380 return (ENOMEM);
381
382 tc = TCAP(nc);
383 tc->rx_pdu = *pdu;
384
385 TAILQ_INSERT_TAIL(&qp->rx_capsules, tc, link);
386 return (0);
387 }
388
389 static int
nvmf_tcp_save_response_capsule(struct nvmf_tcp_qpair *qp,
    struct nvmf_tcp_rxpdu *pdu)
392 {
393 struct nvme_tcp_rsp *rsp;
394 struct nvmf_capsule *nc;
395 struct nvmf_tcp_capsule *tc;
396
397 rsp = (void *)pdu->hdr;
398
399 nc = nvmf_allocate_response(&qp->qp, &rsp->rccqe);
400 if (nc == NULL)
401 return (ENOMEM);
402
403 nc->nc_sqhd_valid = true;
404 tc = TCAP(nc);
405 tc->rx_pdu = *pdu;
406
407 TAILQ_INSERT_TAIL(&qp->rx_capsules, tc, link);
408
409 /*
410 * Once the CQE has been received, no further transfers to the
411 * command buffer for the associated CID can occur.
412 */
413 tcp_purge_command_buffer(qp, rsp->rccqe.cid, 0, true);
414 tcp_purge_command_buffer(qp, rsp->rccqe.cid, 0, false);
415
416 return (0);
417 }
418
419 /*
420 * Construct and send a PDU that contains an optional data payload.
421 * This includes dealing with digests and the length fields in the
422 * common header.
423 */
424 static int
nvmf_tcp_construct_pdu(struct nvmf_tcp_qpair *qp, void *hdr, size_t hlen,
    void *data, uint32_t data_len)
427 {
428 struct nvme_tcp_common_pdu_hdr *ch;
429 struct iovec iov[5];
430 u_int iovcnt;
431 uint32_t header_digest, data_digest, pad, pdo, plen;
432
433 plen = hlen;
434 if (qp->header_digests)
435 plen += sizeof(header_digest);
436 if (data_len != 0) {
437 pdo = roundup(plen, qp->txpda);
438 pad = pdo - plen;
439 plen = pdo + data_len;
440 if (qp->data_digests)
441 plen += sizeof(data_digest);
442 } else {
443 assert(data == NULL);
444 pdo = 0;
445 pad = 0;
446 }
447
448 ch = hdr;
449 ch->hlen = hlen;
450 if (qp->header_digests)
451 ch->flags |= NVME_TCP_CH_FLAGS_HDGSTF;
452 if (qp->data_digests && data_len != 0)
453 ch->flags |= NVME_TCP_CH_FLAGS_DDGSTF;
454 ch->pdo = pdo;
455 ch->plen = htole32(plen);
456
457 /* CH + PSH */
458 iov[0].iov_base = hdr;
459 iov[0].iov_len = hlen;
460 iovcnt = 1;
461
462 /* HDGST */
463 if (qp->header_digests) {
464 header_digest = compute_digest(hdr, hlen);
465 iov[iovcnt].iov_base = &header_digest;
466 iov[iovcnt].iov_len = sizeof(header_digest);
467 iovcnt++;
468 }
469
470 if (pad != 0) {
471 /* PAD */
472 iov[iovcnt].iov_base = __DECONST(char *, zero_padding);
473 iov[iovcnt].iov_len = pad;
474 iovcnt++;
475 }
476
477 if (data_len != 0) {
478 /* DATA */
479 iov[iovcnt].iov_base = data;
480 iov[iovcnt].iov_len = data_len;
481 iovcnt++;
482
483 /* DDGST */
484 if (qp->data_digests) {
485 data_digest = compute_digest(data, data_len);
486 iov[iovcnt].iov_base = &data_digest;
487 iov[iovcnt].iov_len = sizeof(data_digest);
488 iovcnt++;
489 }
490 }
491
492 return (nvmf_tcp_write_pdu_iov(qp, iov, iovcnt, plen));
493 }
494
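/*
 * Controller side: copy the payload of an H2C_DATA PDU into the matching
 * receive command buffer.
 */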
495 static int
nvmf_tcp_handle_h2c_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
497 {
498 struct nvme_tcp_h2c_data_hdr *h2c;
499 struct nvmf_tcp_command_buffer *cb;
500 uint32_t data_len, data_offset;
501 const char *icd;
502
503 h2c = (void *)pdu->hdr;
504 if (le32toh(h2c->datal) > qp->maxh2cdata) {
505 nvmf_tcp_report_error(qp->qp.nq_association, qp,
506 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_LIMIT_EXCEEDED, 0,
507 pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen);
508 nvmf_tcp_free_pdu(pdu);
509 return (EBADMSG);
510 }
511
512 cb = tcp_find_command_buffer(qp, h2c->cccid, h2c->ttag, true);
513 if (cb == NULL) {
514 nvmf_tcp_report_error(qp->qp.nq_association, qp,
515 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
516 offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->hdr,
517 le32toh(pdu->hdr->plen), pdu->hdr->hlen);
518 nvmf_tcp_free_pdu(pdu);
519 return (EBADMSG);
520 }
521
522 data_len = le32toh(h2c->datal);
523 if (data_len != pdu->data_len) {
524 nvmf_tcp_report_error(qp->qp.nq_association, qp,
525 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
526 offsetof(struct nvme_tcp_h2c_data_hdr, datal), pdu->hdr,
527 le32toh(pdu->hdr->plen), pdu->hdr->hlen);
528 nvmf_tcp_free_pdu(pdu);
529 return (EBADMSG);
530 }
531
532 data_offset = le32toh(h2c->datao);
533 if (data_offset < cb->data_offset ||
534 data_offset + data_len > cb->data_offset + cb->data_len) {
535 nvmf_tcp_report_error(qp->qp.nq_association, qp,
536 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
537 pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen);
538 nvmf_tcp_free_pdu(pdu);
539 return (EBADMSG);
540 }
541
542 if (data_offset != cb->data_offset + cb->data_xfered) {
543 nvmf_tcp_report_error(qp->qp.nq_association, qp,
544 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
545 le32toh(pdu->hdr->plen), pdu->hdr->hlen);
546 nvmf_tcp_free_pdu(pdu);
547 return (EBADMSG);
548 }
549
550 if ((cb->data_xfered + data_len == cb->data_len) !=
551 ((pdu->hdr->flags & NVME_TCP_H2C_DATA_FLAGS_LAST_PDU) != 0)) {
552 nvmf_tcp_report_error(qp->qp.nq_association, qp,
553 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
554 le32toh(pdu->hdr->plen), pdu->hdr->hlen);
555 nvmf_tcp_free_pdu(pdu);
556 return (EBADMSG);
557 }
558
559 cb->data_xfered += data_len;
560 data_offset -= cb->data_offset;
561 icd = (const char *)pdu->hdr + pdu->hdr->pdo;
562 memcpy((char *)cb->data + data_offset, icd, data_len);
563
564 nvmf_tcp_free_pdu(pdu);
565 return (0);
566 }
567
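/*
 * Host side: copy the payload of a C2H_DATA PDU into the matching command
 * buffer.  If the SUCCESS flag is set, synthesize a completion since the
 * controller will not send a separate response capsule.
 */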
568 static int
nvmf_tcp_handle_c2h_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
570 {
571 struct nvme_tcp_c2h_data_hdr *c2h;
572 struct nvmf_tcp_command_buffer *cb;
573 uint32_t data_len, data_offset;
574 const char *icd;
575
576 c2h = (void *)pdu->hdr;
577
578 cb = tcp_find_command_buffer(qp, c2h->cccid, 0, true);
579 if (cb == NULL) {
580 /*
581 * XXX: Could be PDU sequence error if cccid is for a
582 * command that doesn't use a command buffer.
583 */
584 nvmf_tcp_report_error(qp->qp.nq_association, qp,
585 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
586 offsetof(struct nvme_tcp_c2h_data_hdr, cccid), pdu->hdr,
587 le32toh(pdu->hdr->plen), pdu->hdr->hlen);
588 nvmf_tcp_free_pdu(pdu);
589 return (EBADMSG);
590 }
591
592 data_len = le32toh(c2h->datal);
593 if (data_len != pdu->data_len) {
594 nvmf_tcp_report_error(qp->qp.nq_association, qp,
595 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
596 offsetof(struct nvme_tcp_c2h_data_hdr, datal), pdu->hdr,
597 le32toh(pdu->hdr->plen), pdu->hdr->hlen);
598 nvmf_tcp_free_pdu(pdu);
599 return (EBADMSG);
600 }
601
602 data_offset = le32toh(c2h->datao);
603 if (data_offset < cb->data_offset ||
604 data_offset + data_len > cb->data_offset + cb->data_len) {
605 nvmf_tcp_report_error(qp->qp.nq_association, qp,
606 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
607 pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen);
608 nvmf_tcp_free_pdu(pdu);
609 return (EBADMSG);
610 }
611
612 if (data_offset != cb->data_offset + cb->data_xfered) {
613 nvmf_tcp_report_error(qp->qp.nq_association, qp,
614 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
615 le32toh(pdu->hdr->plen), pdu->hdr->hlen);
616 nvmf_tcp_free_pdu(pdu);
617 return (EBADMSG);
618 }
619
620 if ((cb->data_xfered + data_len == cb->data_len) !=
621 ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_LAST_PDU) != 0)) {
622 nvmf_tcp_report_error(qp->qp.nq_association, qp,
623 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
624 le32toh(pdu->hdr->plen), pdu->hdr->hlen);
625 nvmf_tcp_free_pdu(pdu);
626 return (EBADMSG);
627 }
628
629 cb->data_xfered += data_len;
630 data_offset -= cb->data_offset;
631 icd = (const char *)pdu->hdr + pdu->hdr->pdo;
632 memcpy((char *)cb->data + data_offset, icd, data_len);
633
634 if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) {
635 struct nvme_completion cqe;
636 struct nvmf_tcp_capsule *tc;
637 struct nvmf_capsule *nc;
638
639 memset(&cqe, 0, sizeof(cqe));
640 cqe.cid = cb->cid;
641
642 nc = nvmf_allocate_response(&qp->qp, &cqe);
643 if (nc == NULL) {
644 nvmf_tcp_free_pdu(pdu);
645 return (ENOMEM);
646 }
647 nc->nc_sqhd_valid = false;
648
649 tc = TCAP(nc);
650 TAILQ_INSERT_TAIL(&qp->rx_capsules, tc, link);
651 }
652
653 nvmf_tcp_free_pdu(pdu);
654 return (0);
655 }
656
/* NB: cid and ttag are little-endian already. */
658 static int
tcp_send_h2c_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
    uint32_t data_offset, void *buf, size_t len, bool last_pdu)
661 {
662 struct nvme_tcp_h2c_data_hdr h2c;
663
664 memset(&h2c, 0, sizeof(h2c));
665 h2c.common.pdu_type = NVME_TCP_PDU_TYPE_H2C_DATA;
666 if (last_pdu)
667 h2c.common.flags |= NVME_TCP_H2C_DATA_FLAGS_LAST_PDU;
668 h2c.cccid = cid;
669 h2c.ttag = ttag;
670 h2c.datao = htole32(data_offset);
671 h2c.datal = htole32(len);
672
673 return (nvmf_tcp_construct_pdu(qp, &h2c, sizeof(h2c), buf, len));
674 }
675
676 /* Sends one or more H2C_DATA PDUs, subject to MAXH2CDATA. */
677 static int
tcp_send_h2c_pdus(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
    uint32_t data_offset, void *buf, size_t len, bool last_pdu)
680 {
681 char *p;
682
683 p = buf;
684 while (len != 0) {
685 size_t todo;
686 int error;
687
688 todo = len;
689 if (todo > qp->maxh2cdata)
690 todo = qp->maxh2cdata;
691 error = tcp_send_h2c_pdu(qp, cid, ttag, data_offset, p, todo,
692 last_pdu && todo == len);
693 if (error != 0)
694 return (error);
695 p += todo;
696 len -= todo;
697 }
698 return (0);
699 }
700
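/*
 * Host side: satisfy an R2T PDU by sending the requested range of the
 * transmit command buffer as H2C_DATA PDUs.
 */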
701 static int
nvmf_tcp_handle_r2t(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
703 {
704 struct nvmf_tcp_command_buffer *cb;
705 struct nvme_tcp_r2t_hdr *r2t;
706 uint32_t data_len, data_offset;
707 int error;
708
709 r2t = (void *)pdu->hdr;
710
711 cb = tcp_find_command_buffer(qp, r2t->cccid, 0, false);
712 if (cb == NULL) {
713 nvmf_tcp_report_error(qp->qp.nq_association, qp,
714 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
715 offsetof(struct nvme_tcp_r2t_hdr, cccid), pdu->hdr,
716 le32toh(pdu->hdr->plen), pdu->hdr->hlen);
717 nvmf_tcp_free_pdu(pdu);
718 return (EBADMSG);
719 }
720
721 data_offset = le32toh(r2t->r2to);
722 if (data_offset != cb->data_xfered) {
723 nvmf_tcp_report_error(qp->qp.nq_association, qp,
724 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
725 le32toh(pdu->hdr->plen), pdu->hdr->hlen);
726 nvmf_tcp_free_pdu(pdu);
727 return (EBADMSG);
728 }
729
730 /*
 * XXX: The spec does not specify how to handle R2T transfers
732 * out of range of the original command.
733 */
734 data_len = le32toh(r2t->r2tl);
735 if (data_offset + data_len > cb->data_len) {
736 nvmf_tcp_report_error(qp->qp.nq_association, qp,
737 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
738 pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen);
739 nvmf_tcp_free_pdu(pdu);
740 return (EBADMSG);
741 }
742
743 cb->data_xfered += data_len;
744
745 /*
746 * Write out one or more H2C_DATA PDUs containing the
747 * requested data.
748 */
749 error = tcp_send_h2c_pdus(qp, r2t->cccid, r2t->ttag,
750 data_offset, (char *)cb->data + data_offset, data_len, true);
751
752 nvmf_tcp_free_pdu(pdu);
753 return (error);
754 }
755
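/* Read the next PDU from the socket and dispatch it based on its type. */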
756 static int
nvmf_tcp_receive_pdu(struct nvmf_tcp_qpair *qp)
758 {
759 struct nvmf_tcp_rxpdu pdu;
760 int error;
761
762 error = nvmf_tcp_read_pdu(qp, &pdu);
763 if (error != 0)
764 return (error);
765
766 switch (pdu.hdr->pdu_type) {
767 default:
768 __unreachable();
769 break;
770 case NVME_TCP_PDU_TYPE_H2C_TERM_REQ:
771 case NVME_TCP_PDU_TYPE_C2H_TERM_REQ:
772 return (nvmf_tcp_handle_term_req(&pdu));
773 case NVME_TCP_PDU_TYPE_CAPSULE_CMD:
774 return (nvmf_tcp_save_command_capsule(qp, &pdu));
775 case NVME_TCP_PDU_TYPE_CAPSULE_RESP:
776 return (nvmf_tcp_save_response_capsule(qp, &pdu));
777 case NVME_TCP_PDU_TYPE_H2C_DATA:
778 return (nvmf_tcp_handle_h2c_data(qp, &pdu));
779 case NVME_TCP_PDU_TYPE_C2H_DATA:
780 return (nvmf_tcp_handle_c2h_data(qp, &pdu));
781 case NVME_TCP_PDU_TYPE_R2T:
782 return (nvmf_tcp_handle_r2t(qp, &pdu));
783 }
784 }
785
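/*
 * Validate an ICReq or ICResp PDU received during connection
 * establishment.
 */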
786 static bool
nvmf_tcp_validate_ic_pdu(struct nvmf_association *na, struct nvmf_tcp_qpair *qp,
    const struct nvme_tcp_common_pdu_hdr *ch, size_t pdu_len)
789 {
790 const struct nvme_tcp_ic_req *pdu;
791 uint32_t plen;
792 u_int hlen;
793
794 /* Determine how large of a PDU header to return for errors. */
795 hlen = ch->hlen;
796 plen = le32toh(ch->plen);
797 if (hlen < sizeof(*ch) || hlen > plen)
798 hlen = sizeof(*ch);
799
800 /*
801 * Errors must be reported for the lowest incorrect field
802 * first, so validate fields in order.
803 */
804
805 /* Validate pdu_type. */
806
807 /* Controllers only receive PDUs with a PDU direction of 0. */
808 if (na->na_controller != ((ch->pdu_type & 0x01) == 0)) {
809 na_error(na, "NVMe/TCP: Invalid PDU type %u", ch->pdu_type);
810 nvmf_tcp_report_error(na, qp,
811 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 0, ch, pdu_len,
812 hlen);
813 return (false);
814 }
815
816 switch (ch->pdu_type) {
817 case NVME_TCP_PDU_TYPE_IC_REQ:
818 case NVME_TCP_PDU_TYPE_IC_RESP:
819 break;
820 default:
821 na_error(na, "NVMe/TCP: Invalid PDU type %u", ch->pdu_type);
822 nvmf_tcp_report_error(na, qp,
823 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 0, ch, pdu_len,
824 hlen);
825 return (false);
826 }
827
828 /* Validate flags. */
829 if (ch->flags != 0) {
830 na_error(na, "NVMe/TCP: Invalid PDU header flags %#x",
831 ch->flags);
832 nvmf_tcp_report_error(na, qp,
833 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 1, ch, pdu_len,
834 hlen);
835 return (false);
836 }
837
838 /* Validate hlen. */
839 if (ch->hlen != 128) {
840 na_error(na, "NVMe/TCP: Invalid PDU header length %u",
841 ch->hlen);
842 nvmf_tcp_report_error(na, qp,
843 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 2, ch, pdu_len,
844 hlen);
845 return (false);
846 }
847
848 /* Validate pdo. */
849 if (ch->pdo != 0) {
850 na_error(na, "NVMe/TCP: Invalid PDU data offset %u", ch->pdo);
851 nvmf_tcp_report_error(na, qp,
852 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 3, ch, pdu_len,
853 hlen);
854 return (false);
855 }
856
857 /* Validate plen. */
858 if (plen != 128) {
859 na_error(na, "NVMe/TCP: Invalid PDU length %u", plen);
860 nvmf_tcp_report_error(na, qp,
861 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 4, ch, pdu_len,
862 hlen);
863 return (false);
864 }
865
866 /* Validate fields common to both ICReq and ICResp. */
867 pdu = (const struct nvme_tcp_ic_req *)ch;
868 if (le16toh(pdu->pfv) != 0) {
869 na_error(na, "NVMe/TCP: Unsupported PDU version %u",
870 le16toh(pdu->pfv));
871 nvmf_tcp_report_error(na, qp,
872 NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER,
873 8, ch, pdu_len, hlen);
874 return (false);
875 }
876
877 if (pdu->hpda > NVME_TCP_HPDA_MAX) {
878 na_error(na, "NVMe/TCP: Unsupported PDA %u", pdu->hpda);
879 nvmf_tcp_report_error(na, qp,
880 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 10, ch, pdu_len,
881 hlen);
882 return (false);
883 }
884
885 if (pdu->dgst.bits.reserved != 0) {
886 na_error(na, "NVMe/TCP: Invalid digest settings");
887 nvmf_tcp_report_error(na, qp,
888 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 11, ch, pdu_len,
889 hlen);
890 return (false);
891 }
892
893 return (true);
894 }
895
896 static bool
nvmf_tcp_read_ic_req(struct nvmf_association *na, struct nvmf_tcp_qpair *qp,
    struct nvme_tcp_ic_req *pdu)
899 {
900 int error;
901
902 error = nvmf_tcp_read_buffer(qp->s, pdu, sizeof(*pdu));
903 if (error != 0) {
904 na_error(na, "NVMe/TCP: Failed to read IC request: %s",
905 strerror(error));
906 return (false);
907 }
908
909 return (nvmf_tcp_validate_ic_pdu(na, qp, &pdu->common, sizeof(*pdu)));
910 }
911
912 static bool
nvmf_tcp_read_ic_resp(struct nvmf_association *na, struct nvmf_tcp_qpair *qp,
    struct nvme_tcp_ic_resp *pdu)
915 {
916 int error;
917
918 error = nvmf_tcp_read_buffer(qp->s, pdu, sizeof(*pdu));
919 if (error != 0) {
920 na_error(na, "NVMe/TCP: Failed to read IC response: %s",
921 strerror(error));
922 return (false);
923 }
924
925 return (nvmf_tcp_validate_ic_pdu(na, qp, &pdu->common, sizeof(*pdu)));
926 }
927
928 static struct nvmf_association *
tcp_allocate_association(bool controller,
    const struct nvmf_association_params *params)
931 {
932 struct nvmf_tcp_association *ta;
933
934 if (controller) {
935 /* 7.4.10.3 */
936 if (params->tcp.maxh2cdata < 4096 ||
937 params->tcp.maxh2cdata % 4 != 0)
938 return (NULL);
939 }
940
941 ta = calloc(1, sizeof(*ta));
942
943 return (&ta->na);
944 }
945
946 static void
tcp_update_association(struct nvmf_association *na,
    const struct nvme_controller_data *cdata)
949 {
950 struct nvmf_tcp_association *ta = TASSOC(na);
951
952 ta->ioccsz = le32toh(cdata->ioccsz);
953 }
954
955 static void
tcp_free_association(struct nvmf_association *na)
957 {
958 free(na);
959 }
960
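/*
 * Host side of connection establishment: send an ICReq PDU, validate the
 * ICResp, and save the negotiated parameters in the queue pair.
 */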
961 static bool
tcp_connect(struct nvmf_tcp_qpair *qp, struct nvmf_association *na, bool admin)
963 {
964 const struct nvmf_association_params *params = &na->na_params;
965 struct nvmf_tcp_association *ta = TASSOC(na);
966 struct nvme_tcp_ic_req ic_req;
967 struct nvme_tcp_ic_resp ic_resp;
968 uint32_t maxh2cdata;
969 int error;
970
971 if (!admin) {
972 if (ta->ioccsz == 0) {
973 na_error(na, "TCP I/O queues require cdata");
974 return (false);
975 }
976 if (ta->ioccsz < 4) {
977 na_error(na, "Invalid IOCCSZ %u", ta->ioccsz);
978 return (false);
979 }
980 }
981
982 memset(&ic_req, 0, sizeof(ic_req));
983 ic_req.common.pdu_type = NVME_TCP_PDU_TYPE_IC_REQ;
984 ic_req.common.hlen = sizeof(ic_req);
985 ic_req.common.plen = htole32(sizeof(ic_req));
986 ic_req.pfv = htole16(0);
987 ic_req.hpda = params->tcp.pda;
988 if (params->tcp.header_digests)
989 ic_req.dgst.bits.hdgst_enable = 1;
990 if (params->tcp.data_digests)
991 ic_req.dgst.bits.ddgst_enable = 1;
992 ic_req.maxr2t = htole32(params->tcp.maxr2t);
993
994 error = nvmf_tcp_write_pdu(qp, &ic_req, sizeof(ic_req));
995 if (error != 0) {
996 na_error(na, "Failed to write IC request: %s", strerror(error));
997 return (false);
998 }
999
1000 if (!nvmf_tcp_read_ic_resp(na, qp, &ic_resp))
1001 return (false);
1002
1003 /* Ensure the controller didn't enable digests we didn't request. */
1004 if ((!params->tcp.header_digests &&
1005 ic_resp.dgst.bits.hdgst_enable != 0) ||
1006 (!params->tcp.data_digests &&
1007 ic_resp.dgst.bits.ddgst_enable != 0)) {
1008 na_error(na, "Controller enabled unrequested digests");
1009 nvmf_tcp_report_error(na, qp,
1010 NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER,
1011 11, &ic_resp, sizeof(ic_resp), sizeof(ic_resp));
1012 return (false);
1013 }
1014
1015 /*
1016 * XXX: Is there an upper-bound to enforce here? Perhaps pick
1017 * some large value and report larger values as an unsupported
1018 * parameter?
1019 */
1020 maxh2cdata = le32toh(ic_resp.maxh2cdata);
1021 if (maxh2cdata < 4096 || maxh2cdata % 4 != 0) {
1022 na_error(na, "Invalid MAXH2CDATA %u", maxh2cdata);
1023 nvmf_tcp_report_error(na, qp,
1024 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 12, &ic_resp,
1025 sizeof(ic_resp), sizeof(ic_resp));
1026 return (false);
1027 }
1028
1029 qp->rxpda = (params->tcp.pda + 1) * 4;
1030 qp->txpda = (ic_resp.cpda + 1) * 4;
1031 qp->header_digests = ic_resp.dgst.bits.hdgst_enable != 0;
1032 qp->data_digests = ic_resp.dgst.bits.ddgst_enable != 0;
1033 qp->maxr2t = params->tcp.maxr2t;
1034 qp->maxh2cdata = maxh2cdata;
1035 if (admin)
1036 /* 7.4.3 */
1037 qp->max_icd = 8192;
1038 else
1039 qp->max_icd = (ta->ioccsz - 4) * 16;
1040
return (true);
1042 }
1043
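/*
 * Controller side of connection establishment: validate the host's ICReq
 * PDU and reply with an ICResp reflecting the negotiated parameters.
 */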
1044 static bool
tcp_accept(struct nvmf_tcp_qpair *qp, struct nvmf_association *na)
1046 {
1047 const struct nvmf_association_params *params = &na->na_params;
1048 struct nvme_tcp_ic_req ic_req;
1049 struct nvme_tcp_ic_resp ic_resp;
1050 int error;
1051
1052 if (!nvmf_tcp_read_ic_req(na, qp, &ic_req))
1053 return (false);
1054
1055 memset(&ic_resp, 0, sizeof(ic_resp));
1056 ic_resp.common.pdu_type = NVME_TCP_PDU_TYPE_IC_RESP;
ic_resp.common.hlen = sizeof(ic_resp);
ic_resp.common.plen = htole32(sizeof(ic_resp));
1059 ic_resp.pfv = htole16(0);
1060 ic_resp.cpda = params->tcp.pda;
1061 if (params->tcp.header_digests && ic_req.dgst.bits.hdgst_enable != 0)
1062 ic_resp.dgst.bits.hdgst_enable = 1;
1063 if (params->tcp.data_digests && ic_req.dgst.bits.ddgst_enable != 0)
1064 ic_resp.dgst.bits.ddgst_enable = 1;
1065 ic_resp.maxh2cdata = htole32(params->tcp.maxh2cdata);
1066
1067 error = nvmf_tcp_write_pdu(qp, &ic_resp, sizeof(ic_resp));
1068 if (error != 0) {
1069 na_error(na, "Failed to write IC response: %s",
1070 strerror(error));
1071 return (false);
1072 }
1073
1074 qp->rxpda = (params->tcp.pda + 1) * 4;
1075 qp->txpda = (ic_req.hpda + 1) * 4;
1076 qp->header_digests = ic_resp.dgst.bits.hdgst_enable != 0;
1077 qp->data_digests = ic_resp.dgst.bits.ddgst_enable != 0;
1078 qp->maxr2t = le32toh(ic_req.maxr2t);
1079 qp->maxh2cdata = params->tcp.maxh2cdata;
1080 qp->max_icd = 0; /* XXX */
return (true);
1082 }
1083
1084 static struct nvmf_qpair *
tcp_allocate_qpair(struct nvmf_association *na,
    const struct nvmf_qpair_params *qparams)
1087 {
1088 const struct nvmf_association_params *aparams = &na->na_params;
1089 struct nvmf_tcp_qpair *qp;
bool connected;
1091
1092 if (aparams->tcp.pda > NVME_TCP_CPDA_MAX) {
1093 na_error(na, "Invalid PDA");
1094 return (NULL);
1095 }
1096
1097 qp = calloc(1, sizeof(*qp));
1098 qp->s = qparams->tcp.fd;
1099 LIST_INIT(&qp->rx_buffers);
1100 LIST_INIT(&qp->tx_buffers);
1101 TAILQ_INIT(&qp->rx_capsules);
if (na->na_controller)
	connected = tcp_accept(qp, na);
else
	connected = tcp_connect(qp, na, qparams->admin);
if (!connected) {
1107 free(qp);
1108 return (NULL);
1109 }
1110
1111 return (&qp->qp);
1112 }
1113
1114 static void
tcp_free_qpair(struct nvmf_qpair *nq)
1116 {
1117 struct nvmf_tcp_qpair *qp = TQP(nq);
1118 struct nvmf_tcp_capsule *ntc, *tc;
1119 struct nvmf_tcp_command_buffer *ncb, *cb;
1120
1121 TAILQ_FOREACH_SAFE(tc, &qp->rx_capsules, link, ntc) {
1122 TAILQ_REMOVE(&qp->rx_capsules, tc, link);
1123 nvmf_free_capsule(&tc->nc);
1124 }
1125 LIST_FOREACH_SAFE(cb, &qp->rx_buffers, link, ncb) {
1126 tcp_free_command_buffer(cb);
1127 }
1128 LIST_FOREACH_SAFE(cb, &qp->tx_buffers, link, ncb) {
1129 tcp_free_command_buffer(cb);
1130 }
1131 free(qp);
1132 }
1133
1134 static void
tcp_kernel_handoff_params(struct nvmf_qpair *nq, nvlist_t *nvl)
1136 {
1137 struct nvmf_tcp_qpair *qp = TQP(nq);
1138
1139 nvlist_add_number(nvl, "fd", qp->s);
1140 nvlist_add_number(nvl, "rxpda", qp->rxpda);
1141 nvlist_add_number(nvl, "txpda", qp->txpda);
1142 nvlist_add_bool(nvl, "header_digests", qp->header_digests);
1143 nvlist_add_bool(nvl, "data_digests", qp->data_digests);
1144 nvlist_add_number(nvl, "maxr2t", qp->maxr2t);
1145 nvlist_add_number(nvl, "maxh2cdata", qp->maxh2cdata);
1146 nvlist_add_number(nvl, "max_icd", qp->max_icd);
1147 }
1148
1149 static int
tcp_populate_dle(struct nvmf_qpair *nq, struct nvme_discovery_log_entry *dle)
1151 {
1152 struct nvmf_tcp_qpair *qp = TQP(nq);
1153 struct sockaddr_storage ss;
1154 socklen_t ss_len;
1155
1156 ss_len = sizeof(ss);
1157 if (getpeername(qp->s, (struct sockaddr *)&ss, &ss_len) == -1)
1158 return (errno);
1159
1160 if (getnameinfo((struct sockaddr *)&ss, ss_len, dle->traddr,
1161 sizeof(dle->traddr), dle->trsvcid, sizeof(dle->trsvcid),
1162 NI_NUMERICHOST | NI_NUMERICSERV) != 0)
1163 return (EINVAL);
1164
1165 return (0);
1166 }
1167
1168 static struct nvmf_capsule *
tcp_allocate_capsule(struct nvmf_qpair *qp __unused)
1170 {
1171 struct nvmf_tcp_capsule *nc;
1172
1173 nc = calloc(1, sizeof(*nc));
1174 return (&nc->nc);
1175 }
1176
1177 static void
tcp_free_capsule(struct nvmf_capsule *nc)
1179 {
1180 struct nvmf_tcp_capsule *tc = TCAP(nc);
1181
1182 nvmf_tcp_free_pdu(&tc->rx_pdu);
1183 if (tc->cb != NULL)
1184 tcp_free_command_buffer(tc->cb);
1185 free(tc);
1186 }
1187
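/*
 * Transmit a command capsule.  Small payloads being sent to the controller
 * are included as in-capsule data; otherwise a command buffer is queued so
 * the data can be transferred via later R2T/H2C_DATA or C2H_DATA PDUs.
 */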
1188 static int
tcp_transmit_command(struct nvmf_capsule *nc)
1190 {
1191 struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
1192 struct nvmf_tcp_capsule *tc = TCAP(nc);
1193 struct nvme_tcp_cmd cmd;
1194 struct nvme_sgl_descriptor *sgl;
1195 int error;
1196 bool use_icd;
1197
1198 use_icd = false;
1199 if (nc->nc_data_len != 0 && nc->nc_send_data &&
1200 nc->nc_data_len <= qp->max_icd)
1201 use_icd = true;
1202
1203 memset(&cmd, 0, sizeof(cmd));
1204 cmd.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_CMD;
1205 cmd.ccsqe = nc->nc_sqe;
1206
1207 /* Populate SGL in SQE. */
1208 sgl = &cmd.ccsqe.sgl;
1209 memset(sgl, 0, sizeof(*sgl));
1210 sgl->address = 0;
1211 sgl->length = htole32(nc->nc_data_len);
1212 if (use_icd) {
1213 /* Use in-capsule data. */
1214 sgl->type = NVME_SGL_TYPE_ICD;
1215 } else {
1216 /* Use a command buffer. */
1217 sgl->type = NVME_SGL_TYPE_COMMAND_BUFFER;
1218 }
1219
1220 /* Send command capsule. */
1221 error = nvmf_tcp_construct_pdu(qp, &cmd, sizeof(cmd), use_icd ?
1222 nc->nc_data : NULL, use_icd ? nc->nc_data_len : 0);
1223 if (error != 0)
1224 return (error);
1225
1226 /*
1227 * If data will be transferred using a command buffer, allocate a
1228 * buffer structure and queue it.
1229 */
1230 if (nc->nc_data_len != 0 && !use_icd)
1231 tc->cb = tcp_alloc_command_buffer(qp, nc->nc_data, 0,
1232 nc->nc_data_len, cmd.ccsqe.cid, 0, !nc->nc_send_data);
1233
1234 return (0);
1235 }
1236
1237 static int
tcp_transmit_response(struct nvmf_capsule *nc)
1239 {
1240 struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
1241 struct nvme_tcp_rsp rsp;
1242
1243 memset(&rsp, 0, sizeof(rsp));
1244 rsp.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_RESP;
1245 rsp.rccqe = nc->nc_cqe;
1246
1247 return (nvmf_tcp_construct_pdu(qp, &rsp, sizeof(rsp), NULL, 0));
1248 }
1249
1250 static int
tcp_transmit_capsule(struct nvmf_capsule *nc)
1252 {
1253 if (nc->nc_qe_len == sizeof(struct nvme_command))
1254 return (tcp_transmit_command(nc));
1255 else
1256 return (tcp_transmit_response(nc));
1257 }
1258
1259 static int
tcp_receive_capsule(struct nvmf_qpair *nq, struct nvmf_capsule **ncp)
1261 {
1262 struct nvmf_tcp_qpair *qp = TQP(nq);
1263 struct nvmf_tcp_capsule *tc;
1264 int error;
1265
1266 while (TAILQ_EMPTY(&qp->rx_capsules)) {
1267 error = nvmf_tcp_receive_pdu(qp);
1268 if (error != 0)
1269 return (error);
1270 }
1271 tc = TAILQ_FIRST(&qp->rx_capsules);
1272 TAILQ_REMOVE(&qp->rx_capsules, tc, link);
1273 *ncp = &tc->nc;
1274 return (0);
1275 }
1276
1277 static uint8_t
tcp_validate_command_capsule(const struct nvmf_capsule *nc)
1279 {
1280 const struct nvmf_tcp_capsule *tc = CTCAP(nc);
1281 const struct nvme_sgl_descriptor *sgl;
1282
1283 assert(tc->rx_pdu.hdr != NULL);
1284
1285 sgl = &nc->nc_sqe.sgl;
1286 switch (sgl->type) {
1287 case NVME_SGL_TYPE_ICD:
1288 if (tc->rx_pdu.data_len != le32toh(sgl->length)) {
1289 printf("NVMe/TCP: Command Capsule with mismatched ICD length\n");
1290 return (NVME_SC_DATA_SGL_LENGTH_INVALID);
1291 }
1292 break;
1293 case NVME_SGL_TYPE_COMMAND_BUFFER:
1294 if (tc->rx_pdu.data_len != 0) {
1295 printf("NVMe/TCP: Command Buffer SGL with ICD\n");
1296 return (NVME_SC_INVALID_FIELD);
1297 }
1298 break;
1299 default:
1300 printf("NVMe/TCP: Invalid SGL type in Command Capsule\n");
1301 return (NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID);
1302 }
1303
1304 if (sgl->address != 0) {
1305 printf("NVMe/TCP: Invalid SGL offset in Command Capsule\n");
1306 return (NVME_SC_SGL_OFFSET_INVALID);
1307 }
1308
1309 return (NVME_SC_SUCCESS);
1310 }
1311
1312 static size_t
tcp_capsule_data_len(const struct nvmf_capsule *nc)
1314 {
1315 assert(nc->nc_qe_len == sizeof(struct nvme_command));
1316 return (le32toh(nc->nc_sqe.sgl.length));
1317 }
1318
1319 /* NB: cid and ttag are both little-endian already. */
1320 static int
tcp_send_r2t(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
    uint32_t data_offset, uint32_t data_len)
1323 {
1324 struct nvme_tcp_r2t_hdr r2t;
1325
1326 memset(&r2t, 0, sizeof(r2t));
1327 r2t.common.pdu_type = NVME_TCP_PDU_TYPE_R2T;
1328 r2t.cccid = cid;
1329 r2t.ttag = ttag;
1330 r2t.r2to = htole32(data_offset);
1331 r2t.r2tl = htole32(data_len);
1332
1333 return (nvmf_tcp_construct_pdu(qp, &r2t, sizeof(r2t), NULL, 0));
1334 }
1335
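/*
 * Controller side: request data from the host via an R2T PDU and process
 * incoming PDUs until the associated H2C_DATA transfers have filled the
 * buffer.
 */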
1336 static int
tcp_receive_r2t_data(const struct nvmf_capsule *nc, uint32_t data_offset,
    void *buf, size_t len)
1339 {
1340 struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
1341 struct nvmf_tcp_command_buffer *cb;
1342 int error;
1343 uint16_t ttag;
1344
1345 /*
1346 * Don't bother byte-swapping ttag as it is just a cookie
1347 * value returned by the other end as-is.
1348 */
1349 ttag = qp->next_ttag++;
1350
1351 error = tcp_send_r2t(qp, nc->nc_sqe.cid, ttag, data_offset, len);
1352 if (error != 0)
1353 return (error);
1354
1355 cb = tcp_alloc_command_buffer(qp, buf, data_offset, len,
1356 nc->nc_sqe.cid, ttag, true);
1357
1358 /* Parse received PDUs until the data transfer is complete. */
1359 while (cb->data_xfered < cb->data_len) {
1360 error = nvmf_tcp_receive_pdu(qp);
1361 if (error != 0)
1362 break;
1363 }
1364 tcp_free_command_buffer(cb);
1365 return (error);
1366 }
1367
1368 static int
tcp_receive_icd_data(const struct nvmf_capsule *nc, uint32_t data_offset,
    void *buf, size_t len)
1371 {
1372 const struct nvmf_tcp_capsule *tc = CTCAP(nc);
1373 const char *icd;
1374
1375 icd = (const char *)tc->rx_pdu.hdr + tc->rx_pdu.hdr->pdo + data_offset;
1376 memcpy(buf, icd, len);
1377 return (0);
1378 }
1379
1380 static int
tcp_receive_controller_data(const struct nvmf_capsule *nc, uint32_t data_offset,
    void *buf, size_t len)
1383 {
1384 struct nvmf_association *na = nc->nc_qpair->nq_association;
1385 const struct nvme_sgl_descriptor *sgl;
1386 size_t data_len;
1387
1388 if (nc->nc_qe_len != sizeof(struct nvme_command) || !na->na_controller)
1389 return (EINVAL);
1390
1391 sgl = &nc->nc_sqe.sgl;
1392 data_len = le32toh(sgl->length);
1393 if (data_offset + len > data_len)
1394 return (EFBIG);
1395
1396 if (sgl->type == NVME_SGL_TYPE_ICD)
1397 return (tcp_receive_icd_data(nc, data_offset, buf, len));
1398 else
1399 return (tcp_receive_r2t_data(nc, data_offset, buf, len));
1400 }
1401
1402 /* NB: cid is little-endian already. */
1403 static int
tcp_send_c2h_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid,
    uint32_t data_offset, const void *buf, size_t len, bool last_pdu,
    bool success)
1407 {
1408 struct nvme_tcp_c2h_data_hdr c2h;
1409
1410 memset(&c2h, 0, sizeof(c2h));
1411 c2h.common.pdu_type = NVME_TCP_PDU_TYPE_C2H_DATA;
1412 if (last_pdu)
1413 c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_LAST_PDU;
1414 if (success)
1415 c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_SUCCESS;
1416 c2h.cccid = cid;
1417 c2h.datao = htole32(data_offset);
1418 c2h.datal = htole32(len);
1419
1420 return (nvmf_tcp_construct_pdu(qp, &c2h, sizeof(c2h),
1421 __DECONST(void *, buf), len));
1422 }
1423
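/*
 * Controller side: send data to the host as one or more C2H_DATA PDUs,
 * completing the command either via the SUCCESS flag on the last PDU or
 * with an explicit response capsule.
 */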
1424 static int
tcp_send_controller_data(const struct nvmf_capsule *nc, const void *buf,
    size_t len)
1427 {
1428 struct nvmf_association *na = nc->nc_qpair->nq_association;
1429 struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
1430 const struct nvme_sgl_descriptor *sgl;
1431 const char *src;
1432 size_t todo;
1433 uint32_t data_len, data_offset;
1434 int error;
1435 bool last_pdu, send_success_flag;
1436
1437 if (nc->nc_qe_len != sizeof(struct nvme_command) || !na->na_controller)
1438 return (EINVAL);
1439
1440 sgl = &nc->nc_sqe.sgl;
1441 data_len = le32toh(sgl->length);
1442 if (len != data_len) {
1443 nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD);
1444 return (EFBIG);
1445 }
1446
1447 if (sgl->type != NVME_SGL_TYPE_COMMAND_BUFFER) {
1448 nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD);
1449 return (EINVAL);
1450 }
1451
1452 /* Use the SUCCESS flag if SQ flow control is disabled. */
1453 send_success_flag = !qp->qp.nq_flow_control;
1454
1455 /*
1456 * Write out one or more C2H_DATA PDUs containing the data.
1457 * Each PDU is arbitrarily capped at 256k.
1458 */
1459 data_offset = 0;
1460 src = buf;
1461 while (len > 0) {
1462 if (len > 256 * 1024) {
1463 todo = 256 * 1024;
1464 last_pdu = false;
1465 } else {
1466 todo = len;
1467 last_pdu = true;
1468 }
1469 error = tcp_send_c2h_pdu(qp, nc->nc_sqe.cid, data_offset,
1470 src, todo, last_pdu, last_pdu && send_success_flag);
1471 if (error != 0) {
1472 nvmf_send_generic_error(nc,
1473 NVME_SC_TRANSIENT_TRANSPORT_ERROR);
1474 return (error);
1475 }
1476 data_offset += todo;
1477 src += todo;
1478 len -= todo;
1479 }
1480 if (!send_success_flag)
1481 nvmf_send_success(nc);
1482 return (0);
1483 }
1484
1485 struct nvmf_transport_ops tcp_ops = {
1486 .allocate_association = tcp_allocate_association,
1487 .update_association = tcp_update_association,
1488 .free_association = tcp_free_association,
1489 .allocate_qpair = tcp_allocate_qpair,
1490 .free_qpair = tcp_free_qpair,
1491 .kernel_handoff_params = tcp_kernel_handoff_params,
1492 .populate_dle = tcp_populate_dle,
1493 .allocate_capsule = tcp_allocate_capsule,
1494 .free_capsule = tcp_free_capsule,
1495 .transmit_capsule = tcp_transmit_capsule,
1496 .receive_capsule = tcp_receive_capsule,
1497 .validate_command_capsule = tcp_validate_command_capsule,
1498 .capsule_data_len = tcp_capsule_data_len,
1499 .receive_controller_data = tcp_receive_controller_data,
1500 .send_controller_data = tcp_send_controller_data,
1501 };
1502