/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2022-2024 Chelsio Communications, Inc.
 * Written by: John Baldwin <jhb@FreeBSD.org>
 */

#include <sys/param.h>
#include <sys/capsicum.h>
#include <sys/condvar.h>
#include <sys/file.h>
#include <sys/gsb_crc32.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/refcount.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/nvmf.h>
#include <dev/nvmf/nvmf_proto.h>
#include <dev/nvmf/nvmf_tcp.h>
#include <dev/nvmf/nvmf_transport.h>
#include <dev/nvmf/nvmf_transport_internal.h>

struct nvmf_tcp_capsule;
struct nvmf_tcp_qpair;

struct nvmf_tcp_command_buffer {
	struct nvmf_tcp_qpair *qp;

	struct nvmf_io_request io;
	size_t	data_len;
	size_t	data_xfered;
	uint32_t data_offset;

	u_int	refs;
	int	error;

	uint16_t cid;
	uint16_t ttag;

	TAILQ_ENTRY(nvmf_tcp_command_buffer) link;

	/* Controller only */
	struct nvmf_tcp_capsule *tc;
};

struct nvmf_tcp_command_buffer_list {
	TAILQ_HEAD(, nvmf_tcp_command_buffer) head;
	struct mtx lock;
};

struct nvmf_tcp_qpair {
	struct nvmf_qpair qp;

	struct socket *so;

	volatile u_int refs;	/* Every allocated capsule holds a reference */
	uint8_t	txpda;
	uint8_t	rxpda;
	bool	header_digests;
	bool	data_digests;
	uint32_t maxr2t;
	uint32_t maxh2cdata;	/* Controller only */
	uint32_t max_tx_data;
	uint32_t max_icd;	/* Host only */
	uint16_t next_ttag;	/* Controller only */
	u_int	num_ttags;	/* Controller only */
	u_int	active_ttags;	/* Controller only */
	bool	send_success;	/* Controller only */

	/* Receive state. */
	struct thread *rx_thread;
	struct cv rx_cv;
	bool	rx_shutdown;

	/* Transmit state. */
	struct thread *tx_thread;
	struct cv tx_cv;
	bool	tx_shutdown;
	struct mbufq tx_pdus;
	STAILQ_HEAD(, nvmf_tcp_capsule) tx_capsules;

	struct nvmf_tcp_command_buffer_list tx_buffers;
	struct nvmf_tcp_command_buffer_list rx_buffers;

	/*
	 * For the controller, an RX command buffer can be in one of
	 * two locations, all protected by the rx_buffers.lock.  If a
	 * receive request is waiting for either an R2T slot for its
	 * command (due to exceeding MAXR2T), or a transfer tag, it is
	 * placed on the rx_buffers list.  When a request is allocated
	 * an active transfer tag, it moves to the open_ttags[] array
	 * (indexed by the tag) until it completes.
	 */
	struct nvmf_tcp_command_buffer **open_ttags;	/* Controller only */
};

struct nvmf_tcp_rxpdu {
	struct mbuf *m;
	const struct nvme_tcp_common_pdu_hdr *hdr;
	uint32_t data_len;
	bool	data_digest_mismatch;
};

struct nvmf_tcp_capsule {
	struct nvmf_capsule nc;

	volatile u_int refs;

	struct nvmf_tcp_rxpdu rx_pdu;

	uint32_t active_r2ts;		/* Controller only */
#ifdef INVARIANTS
	uint32_t tx_data_offset;	/* Controller only */
	u_int	pending_r2ts;		/* Controller only */
#endif

	STAILQ_ENTRY(nvmf_tcp_capsule) link;
};

#define	TCAP(nc)	((struct nvmf_tcp_capsule *)(nc))
#define	TQP(qp)		((struct nvmf_tcp_qpair *)(qp))

static void	tcp_release_capsule(struct nvmf_tcp_capsule *tc);
static void	tcp_free_qpair(struct nvmf_qpair *nq);

SYSCTL_NODE(_kern_nvmf, OID_AUTO, tcp, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "TCP transport");
static u_int tcp_max_transmit_data = 256 * 1024;
SYSCTL_UINT(_kern_nvmf_tcp, OID_AUTO, max_c2hdata, CTLFLAG_RWTUN,
    &tcp_max_transmit_data, 0,
    "Maximum size of data payload in a transmitted PDU");

static MALLOC_DEFINE(M_NVMF_TCP, "nvmf_tcp", "NVMe over TCP");

static int
mbuf_crc32c_helper(void *arg, void *data, u_int len)
{
	uint32_t *digestp = arg;

	*digestp = calculate_crc32c(*digestp, data, len);
	return (0);
}

static uint32_t
mbuf_crc32c(struct mbuf *m, u_int offset, u_int len)
{
	uint32_t digest = 0xffffffff;

	m_apply(m, offset, len, mbuf_crc32c_helper, &digest);
	digest = digest ^ 0xffffffff;

	return (digest);
}

static uint32_t
compute_digest(const void *buf, size_t len)
{
	return (calculate_crc32c(0xffffffff, buf, len) ^ 0xffffffff);
}

static struct nvmf_tcp_command_buffer *
tcp_alloc_command_buffer(struct nvmf_tcp_qpair *qp,
    const struct nvmf_io_request *io, uint32_t data_offset, size_t data_len,
    uint16_t cid)
{
	struct nvmf_tcp_command_buffer *cb;

	cb = malloc(sizeof(*cb), M_NVMF_TCP, M_WAITOK);
	cb->qp = qp;
	cb->io = *io;
	cb->data_offset = data_offset;
	cb->data_len = data_len;
	cb->data_xfered = 0;
	refcount_init(&cb->refs, 1);
	cb->error = 0;
	cb->cid = cid;
	cb->ttag = 0;
	cb->tc = NULL;

	return (cb);
}

static void
tcp_hold_command_buffer(struct nvmf_tcp_command_buffer *cb)
{
	refcount_acquire(&cb->refs);
}

static void
tcp_free_command_buffer(struct nvmf_tcp_command_buffer *cb)
{
	nvmf_complete_io_request(&cb->io, cb->data_xfered, cb->error);
	if (cb->tc != NULL)
		tcp_release_capsule(cb->tc);
	free(cb, M_NVMF_TCP);
}

static void
tcp_release_command_buffer(struct nvmf_tcp_command_buffer *cb)
{
	if (refcount_release(&cb->refs))
		tcp_free_command_buffer(cb);
}

static void
tcp_add_command_buffer(struct nvmf_tcp_command_buffer_list *list,
    struct nvmf_tcp_command_buffer *cb)
{
	mtx_assert(&list->lock, MA_OWNED);
	TAILQ_INSERT_HEAD(&list->head, cb, link);
}

static struct nvmf_tcp_command_buffer *
tcp_find_command_buffer(struct nvmf_tcp_command_buffer_list *list,
    uint16_t cid, uint16_t ttag)
{
	struct nvmf_tcp_command_buffer *cb;

	mtx_assert(&list->lock, MA_OWNED);
	TAILQ_FOREACH(cb, &list->head, link) {
		if (cb->cid == cid && cb->ttag == ttag)
			return (cb);
	}
	return (NULL);
}

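/*
 * Command buffers on these lists are keyed by (cid, ttag).  Host-side
 * buffers are queued with a ttag of 0 and matched on the CID alone
 * (C2H_DATA, R2T, and response-capsule purges all pass a ttag of 0),
 * while controller-side buffers with an active transfer tag are
 * tracked in open_ttags[] rather than on a list.
 */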
static void
tcp_remove_command_buffer(struct nvmf_tcp_command_buffer_list *list,
    struct nvmf_tcp_command_buffer *cb)
{
	mtx_assert(&list->lock, MA_OWNED);
	TAILQ_REMOVE(&list->head, cb, link);
}

static void
tcp_purge_command_buffer(struct nvmf_tcp_command_buffer_list *list,
    uint16_t cid, uint16_t ttag)
{
	struct nvmf_tcp_command_buffer *cb;

	mtx_lock(&list->lock);
	cb = tcp_find_command_buffer(list, cid, ttag);
	if (cb != NULL) {
		tcp_remove_command_buffer(list, cb);
		mtx_unlock(&list->lock);
		tcp_release_command_buffer(cb);
	} else
		mtx_unlock(&list->lock);
}

static void
nvmf_tcp_write_pdu(struct nvmf_tcp_qpair *qp, struct mbuf *m)
{
	struct socket *so = qp->so;

	SOCKBUF_LOCK(&so->so_snd);
	mbufq_enqueue(&qp->tx_pdus, m);
	/* XXX: Do we need to handle sb_hiwat being wrong? */
	if (sowriteable(so))
		cv_signal(&qp->tx_cv);
	SOCKBUF_UNLOCK(&so->so_snd);
}

static void
nvmf_tcp_report_error(struct nvmf_tcp_qpair *qp, uint16_t fes, uint32_t fei,
    struct mbuf *rx_pdu, u_int hlen)
{
	struct nvme_tcp_term_req_hdr *hdr;
	struct mbuf *m;

	if (hlen != 0) {
		hlen = min(hlen, NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE);
		hlen = min(hlen, m_length(rx_pdu, NULL));
	}

	m = m_get2(sizeof(*hdr) + hlen, M_WAITOK, MT_DATA, 0);
	m->m_len = sizeof(*hdr) + hlen;
	hdr = mtod(m, void *);
	memset(hdr, 0, sizeof(*hdr));
	hdr->common.pdu_type = qp->qp.nq_controller ?
	    NVME_TCP_PDU_TYPE_C2H_TERM_REQ : NVME_TCP_PDU_TYPE_H2C_TERM_REQ;
	hdr->common.hlen = sizeof(*hdr);
	hdr->common.plen = sizeof(*hdr) + hlen;
	hdr->fes = htole16(fes);
	le32enc(hdr->fei, fei);
	if (hlen != 0)
		m_copydata(rx_pdu, 0, hlen, (caddr_t)(hdr + 1));

	nvmf_tcp_write_pdu(qp, m);
}

static int
nvmf_tcp_validate_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
{
	const struct nvme_tcp_common_pdu_hdr *ch;
	struct mbuf *m = pdu->m;
	uint32_t data_len, fei, plen;
	uint32_t digest, rx_digest;
	u_int hlen;
	int error;
	uint16_t fes;

	/* Determine how large of a PDU header to return for errors. */
	ch = pdu->hdr;
	hlen = ch->hlen;
	plen = le32toh(ch->plen);
	if (hlen < sizeof(*ch) || hlen > plen)
		hlen = sizeof(*ch);

	error = nvmf_tcp_validate_pdu_header(ch, qp->qp.nq_controller,
	    qp->header_digests, qp->data_digests, qp->rxpda, &data_len, &fes,
	    &fei);
	if (error != 0) {
		if (error != ECONNRESET)
			nvmf_tcp_report_error(qp, fes, fei, m, hlen);
		return (error);
	}

	/* Check header digest if present. */
	if ((ch->flags & NVME_TCP_CH_FLAGS_HDGSTF) != 0) {
		digest = mbuf_crc32c(m, 0, ch->hlen);
		m_copydata(m, ch->hlen, sizeof(rx_digest), (caddr_t)&rx_digest);
		if (digest != rx_digest) {
			printf("NVMe/TCP: Header digest mismatch\n");
			nvmf_tcp_report_error(qp,
			    NVME_TCP_TERM_REQ_FES_HDGST_ERROR, rx_digest, m,
			    hlen);
			return (EBADMSG);
		}
	}

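	/*
	 * Unlike a header digest failure, which terminates the
	 * connection above, a data digest failure is only recorded
	 * here.  The PDU is still dispatched and the affected I/O
	 * request is failed with EINTEGRITY by the data handlers.
	 */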
	/* Check data digest if present. */
	pdu->data_digest_mismatch = false;
	if ((ch->flags & NVME_TCP_CH_FLAGS_DDGSTF) != 0) {
		digest = mbuf_crc32c(m, ch->pdo, data_len);
		m_copydata(m, plen - sizeof(rx_digest), sizeof(rx_digest),
		    (caddr_t)&rx_digest);
		if (digest != rx_digest) {
			printf("NVMe/TCP: Data digest mismatch\n");
			pdu->data_digest_mismatch = true;
		}
	}

	pdu->data_len = data_len;
	return (0);
}

static void
nvmf_tcp_free_pdu(struct nvmf_tcp_rxpdu *pdu)
{
	m_freem(pdu->m);
	pdu->m = NULL;
	pdu->hdr = NULL;
}

static int
nvmf_tcp_handle_term_req(struct nvmf_tcp_rxpdu *pdu)
{
	const struct nvme_tcp_term_req_hdr *hdr;

	hdr = (const void *)pdu->hdr;

	printf("NVMe/TCP: Received termination request: fes %#x fei %#x\n",
	    le16toh(hdr->fes), le32dec(hdr->fei));
	nvmf_tcp_free_pdu(pdu);
	return (ECONNRESET);
}

static int
nvmf_tcp_save_command_capsule(struct nvmf_tcp_qpair *qp,
    struct nvmf_tcp_rxpdu *pdu)
{
	const struct nvme_tcp_cmd *cmd;
	struct nvmf_capsule *nc;
	struct nvmf_tcp_capsule *tc;

	cmd = (const void *)pdu->hdr;

	nc = nvmf_allocate_command(&qp->qp, &cmd->ccsqe, M_WAITOK);

	tc = TCAP(nc);
	tc->rx_pdu = *pdu;

	nvmf_capsule_received(&qp->qp, nc);
	return (0);
}

static int
nvmf_tcp_save_response_capsule(struct nvmf_tcp_qpair *qp,
    struct nvmf_tcp_rxpdu *pdu)
{
	const struct nvme_tcp_rsp *rsp;
	struct nvmf_capsule *nc;
	struct nvmf_tcp_capsule *tc;

	rsp = (const void *)pdu->hdr;

	nc = nvmf_allocate_response(&qp->qp, &rsp->rccqe, M_WAITOK);

	nc->nc_sqhd_valid = true;
	tc = TCAP(nc);
	tc->rx_pdu = *pdu;

	/*
	 * Once the CQE has been received, no further transfers to the
	 * command buffer for the associated CID can occur.
	 */
	tcp_purge_command_buffer(&qp->rx_buffers, rsp->rccqe.cid, 0);
	tcp_purge_command_buffer(&qp->tx_buffers, rsp->rccqe.cid, 0);

	nvmf_capsule_received(&qp->qp, nc);
	return (0);
}

/*
 * Construct a PDU that contains an optional data payload.  This
 * includes dealing with digests and the length fields in the common
 * header.
 */
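/*
 * The resulting wire layout is:
 *
 *	HDR | HDGST (if enabled) | PAD (to txpda) | DATA | DDGST (if enabled)
 *
 * Illustrative example: with a 24-byte header, header digests
 * enabled, and a transmit PDU data alignment (txpda) of 8 bytes,
 * plen starts at 28, pdo rounds up to 32 (4 bytes of PAD), and the
 * final plen is 32 + data_len, plus 4 more if data digests are
 * enabled.
 */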
static struct mbuf *
nvmf_tcp_construct_pdu(struct nvmf_tcp_qpair *qp, void *hdr, size_t hlen,
    struct mbuf *data, uint32_t data_len)
{
	struct nvme_tcp_common_pdu_hdr *ch;
	struct mbuf *top;
	uint32_t digest, pad, pdo, plen, mlen;

	plen = hlen;
	if (qp->header_digests)
		plen += sizeof(digest);
	if (data_len != 0) {
		KASSERT(m_length(data, NULL) == data_len, ("length mismatch"));
		pdo = roundup2(plen, qp->txpda);
		pad = pdo - plen;
		plen = pdo + data_len;
		if (qp->data_digests)
			plen += sizeof(digest);
		mlen = pdo;
	} else {
		KASSERT(data == NULL, ("payload mbuf with zero length"));
		pdo = 0;
		pad = 0;
		mlen = plen;
	}

	top = m_get2(mlen, M_WAITOK, MT_DATA, 0);
	top->m_len = mlen;
	ch = mtod(top, void *);
	memcpy(ch, hdr, hlen);
	ch->hlen = hlen;
	if (qp->header_digests)
		ch->flags |= NVME_TCP_CH_FLAGS_HDGSTF;
	if (qp->data_digests && data_len != 0)
		ch->flags |= NVME_TCP_CH_FLAGS_DDGSTF;
	ch->pdo = pdo;
	ch->plen = htole32(plen);

	/* HDGST */
	if (qp->header_digests) {
		digest = compute_digest(ch, hlen);
		memcpy((char *)ch + hlen, &digest, sizeof(digest));
	}

	if (pad != 0) {
		/* PAD */
		memset((char *)ch + pdo - pad, 0, pad);
	}

	if (data_len != 0) {
		/* DATA */
		top->m_next = data;

		/* DDGST */
		if (qp->data_digests) {
			digest = mbuf_crc32c(data, 0, data_len);

			/* XXX: Can't use m_append as it uses M_NOWAIT. */
			while (data->m_next != NULL)
				data = data->m_next;

			data->m_next = m_get(M_WAITOK, MT_DATA);
			data->m_next->m_len = sizeof(digest);
			memcpy(mtod(data->m_next, void *), &digest,
			    sizeof(digest));
		}
	}

	return (top);
}

/* Find the next command buffer eligible to schedule for R2T. */
static struct nvmf_tcp_command_buffer *
nvmf_tcp_next_r2t(struct nvmf_tcp_qpair *qp)
{
	struct nvmf_tcp_command_buffer *cb;

	mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
	MPASS(qp->active_ttags < qp->num_ttags);

	TAILQ_FOREACH(cb, &qp->rx_buffers.head, link) {
		/* NB: maxr2t is 0's based. */
		if (cb->tc->active_r2ts > qp->maxr2t)
			continue;
#ifdef INVARIANTS
		cb->tc->pending_r2ts--;
#endif
		TAILQ_REMOVE(&qp->rx_buffers.head, cb, link);
		return (cb);
	}
	return (NULL);
}

/* Allocate the next free transfer tag and assign it to cb. */
static void
nvmf_tcp_allocate_ttag(struct nvmf_tcp_qpair *qp,
    struct nvmf_tcp_command_buffer *cb)
{
	uint16_t ttag;

	mtx_assert(&qp->rx_buffers.lock, MA_OWNED);

	ttag = qp->next_ttag;
	for (;;) {
		if (qp->open_ttags[ttag] == NULL)
			break;
		if (ttag == qp->num_ttags - 1)
			ttag = 0;
		else
			ttag++;
		MPASS(ttag != qp->next_ttag);
	}
	if (ttag == qp->num_ttags - 1)
		qp->next_ttag = 0;
	else
		qp->next_ttag = ttag + 1;

	cb->tc->active_r2ts++;
	qp->active_ttags++;
	qp->open_ttags[ttag] = cb;

	/*
	 * Don't bother byte-swapping ttag as it is just a cookie
	 * value returned by the other end as-is.
	 */
	cb->ttag = ttag;
}

/* NB: cid and ttag are both little-endian already. */
static void
tcp_send_r2t(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
    uint32_t data_offset, uint32_t data_len)
{
	struct nvme_tcp_r2t_hdr r2t;
	struct mbuf *m;

	memset(&r2t, 0, sizeof(r2t));
	r2t.common.pdu_type = NVME_TCP_PDU_TYPE_R2T;
	r2t.cccid = cid;
	r2t.ttag = ttag;
	r2t.r2to = htole32(data_offset);
	r2t.r2tl = htole32(data_len);

	m = nvmf_tcp_construct_pdu(qp, &r2t, sizeof(r2t), NULL, 0);
	nvmf_tcp_write_pdu(qp, m);
}

/*
 * Release a transfer tag and schedule another R2T.
 *
 * NB: This drops the rx_buffers.lock mutex.
 */
static void
nvmf_tcp_send_next_r2t(struct nvmf_tcp_qpair *qp,
    struct nvmf_tcp_command_buffer *cb)
{
	struct nvmf_tcp_command_buffer *ncb;

	mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
	MPASS(qp->open_ttags[cb->ttag] == cb);

	/* Release this transfer tag. */
	qp->open_ttags[cb->ttag] = NULL;
	qp->active_ttags--;
	cb->tc->active_r2ts--;

	/* Schedule another R2T. */
	ncb = nvmf_tcp_next_r2t(qp);
	if (ncb != NULL) {
		nvmf_tcp_allocate_ttag(qp, ncb);
		mtx_unlock(&qp->rx_buffers.lock);
		tcp_send_r2t(qp, ncb->cid, ncb->ttag, ncb->data_offset,
		    ncb->data_len);
	} else
		mtx_unlock(&qp->rx_buffers.lock);
}

/*
 * Copy len bytes starting at offset skip from an mbuf chain into an
 * I/O buffer at destination offset io_offset.
 */
static void
mbuf_copyto_io(struct mbuf *m, u_int skip, u_int len,
    struct nvmf_io_request *io, u_int io_offset)
{
	u_int todo;

	while (m->m_len <= skip) {
		skip -= m->m_len;
		m = m->m_next;
	}
	while (len != 0) {
		MPASS((m->m_flags & M_EXTPG) == 0);

		todo = m->m_len - skip;
		if (todo > len)
			todo = len;

		memdesc_copyback(&io->io_mem, io_offset, todo, mtodo(m, skip));
		skip = 0;
		io_offset += todo;
		len -= todo;
		m = m->m_next;
	}
}

static int
nvmf_tcp_handle_h2c_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
{
	const struct nvme_tcp_h2c_data_hdr *h2c;
	struct nvmf_tcp_command_buffer *cb;
	uint32_t data_len, data_offset;
	uint16_t ttag;

	h2c = (const void *)pdu->hdr;
	if (le32toh(h2c->datal) > qp->maxh2cdata) {
		nvmf_tcp_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_LIMIT_EXCEEDED, 0,
		    pdu->m, pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	/*
	 * NB: Don't bother byte-swapping ttag as we don't byte-swap
	 * it when sending.
	 */
	ttag = h2c->ttag;
	if (ttag >= qp->num_ttags) {
		nvmf_tcp_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
		    offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->m,
		    pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	mtx_lock(&qp->rx_buffers.lock);
	cb = qp->open_ttags[ttag];
	if (cb == NULL) {
		mtx_unlock(&qp->rx_buffers.lock);
		nvmf_tcp_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
		    offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->m,
		    pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}
	MPASS(cb->ttag == ttag);

	/* For a data digest mismatch, fail the I/O request. */
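	/*
	 * No new R2T is issued for the corrupted range; the transfer
	 * tag is simply recycled and the request completes with
	 * EINTEGRITY once the last reference on the buffer is
	 * dropped.
	 */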
	if (pdu->data_digest_mismatch) {
		nvmf_tcp_send_next_r2t(qp, cb);
		cb->error = EINTEGRITY;
		tcp_release_command_buffer(cb);
		nvmf_tcp_free_pdu(pdu);
		return (0);
	}

	data_len = le32toh(h2c->datal);
	if (data_len != pdu->data_len) {
		mtx_unlock(&qp->rx_buffers.lock);
		nvmf_tcp_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
		    offsetof(struct nvme_tcp_h2c_data_hdr, datal), pdu->m,
		    pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	data_offset = le32toh(h2c->datao);
	if (data_offset < cb->data_offset ||
	    data_offset + data_len > cb->data_offset + cb->data_len) {
		mtx_unlock(&qp->rx_buffers.lock);
		nvmf_tcp_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, pdu->m,
		    pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	if (data_offset != cb->data_offset + cb->data_xfered) {
		mtx_unlock(&qp->rx_buffers.lock);
		nvmf_tcp_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
		    pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	if ((cb->data_xfered + data_len == cb->data_len) !=
	    ((pdu->hdr->flags & NVME_TCP_H2C_DATA_FLAGS_LAST_PDU) != 0)) {
		mtx_unlock(&qp->rx_buffers.lock);
		nvmf_tcp_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
		    pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	cb->data_xfered += data_len;
	data_offset -= cb->data_offset;
	if (cb->data_xfered == cb->data_len) {
		nvmf_tcp_send_next_r2t(qp, cb);
	} else {
		tcp_hold_command_buffer(cb);
		mtx_unlock(&qp->rx_buffers.lock);
	}

	mbuf_copyto_io(pdu->m, pdu->hdr->pdo, data_len, &cb->io, data_offset);

	tcp_release_command_buffer(cb);
	nvmf_tcp_free_pdu(pdu);
	return (0);
}

static int
nvmf_tcp_handle_c2h_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
{
	const struct nvme_tcp_c2h_data_hdr *c2h;
	struct nvmf_tcp_command_buffer *cb;
	uint32_t data_len, data_offset;

	c2h = (const void *)pdu->hdr;

	mtx_lock(&qp->rx_buffers.lock);
	cb = tcp_find_command_buffer(&qp->rx_buffers, c2h->cccid, 0);
	if (cb == NULL) {
		mtx_unlock(&qp->rx_buffers.lock);
		/*
		 * XXX: Could be PDU sequence error if cccid is for a
		 * command that doesn't use a command buffer.
		 */
		nvmf_tcp_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
		    offsetof(struct nvme_tcp_c2h_data_hdr, cccid), pdu->m,
		    pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	/* For a data digest mismatch, fail the I/O request. */
	if (pdu->data_digest_mismatch) {
		cb->error = EINTEGRITY;
		tcp_remove_command_buffer(&qp->rx_buffers, cb);
		mtx_unlock(&qp->rx_buffers.lock);
		tcp_release_command_buffer(cb);
		nvmf_tcp_free_pdu(pdu);
		return (0);
	}

	data_len = le32toh(c2h->datal);
	if (data_len != pdu->data_len) {
		mtx_unlock(&qp->rx_buffers.lock);
		nvmf_tcp_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
		    offsetof(struct nvme_tcp_c2h_data_hdr, datal), pdu->m,
		    pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	data_offset = le32toh(c2h->datao);
	if (data_offset < cb->data_offset ||
	    data_offset + data_len > cb->data_offset + cb->data_len) {
		mtx_unlock(&qp->rx_buffers.lock);
		nvmf_tcp_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
		    pdu->m, pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	if (data_offset != cb->data_offset + cb->data_xfered) {
		mtx_unlock(&qp->rx_buffers.lock);
		nvmf_tcp_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
		    pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	if ((cb->data_xfered + data_len == cb->data_len) !=
	    ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_LAST_PDU) != 0)) {
		mtx_unlock(&qp->rx_buffers.lock);
		nvmf_tcp_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
		    pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	cb->data_xfered += data_len;
	data_offset -= cb->data_offset;
	if (cb->data_xfered == cb->data_len)
		tcp_remove_command_buffer(&qp->rx_buffers, cb);
	else
		tcp_hold_command_buffer(cb);
	mtx_unlock(&qp->rx_buffers.lock);

	mbuf_copyto_io(pdu->m, pdu->hdr->pdo, data_len, &cb->io, data_offset);

	tcp_release_command_buffer(cb);

	if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) {
		struct nvme_completion cqe;
		struct nvmf_capsule *nc;

		memset(&cqe, 0, sizeof(cqe));
		cqe.cid = c2h->cccid;

		nc = nvmf_allocate_response(&qp->qp, &cqe, M_WAITOK);
		nc->nc_sqhd_valid = false;

		nvmf_capsule_received(&qp->qp, nc);
	}

	nvmf_tcp_free_pdu(pdu);
	return (0);
}

/* Called when m_free drops refcount to 0. */
static void
nvmf_tcp_mbuf_done(struct mbuf *m)
{
	struct nvmf_tcp_command_buffer *cb = m->m_ext.ext_arg1;

	tcp_free_command_buffer(cb);
}

static struct mbuf *
nvmf_tcp_mbuf(void *arg, int how, void *data, size_t len)
{
	struct nvmf_tcp_command_buffer *cb = arg;
	struct mbuf *m;

	m = m_get(how, MT_DATA);
	m->m_flags |= M_RDONLY;
	m_extaddref(m, data, len, &cb->refs, nvmf_tcp_mbuf_done, cb, NULL);
	m->m_len = len;
	return (m);
}

static void
nvmf_tcp_free_mext_pg(struct mbuf *m)
{
	struct nvmf_tcp_command_buffer *cb = m->m_ext.ext_arg1;

	M_ASSERTEXTPG(m);
	tcp_release_command_buffer(cb);
}

static struct mbuf *
nvmf_tcp_mext_pg(void *arg, int how)
{
	struct nvmf_tcp_command_buffer *cb = arg;
	struct mbuf *m;

	m = mb_alloc_ext_pgs(how, nvmf_tcp_free_mext_pg);
	m->m_ext.ext_arg1 = cb;
	tcp_hold_command_buffer(cb);
	return (m);
}

/*
 * Return an mbuf chain for a range of data belonging to a command
 * buffer.
 *
 * The mbuf chain uses M_EXT mbufs which hold references on the
 * command buffer so that it remains "alive" until the data has been
 * fully transmitted.  If can_truncate is true, the returned chain may
 * be shorter than requested to avoid gratuitously splitting up a
 * page.
 */
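/*
 * For mapped buffers, nvmf_tcp_mbuf() shares cb->refs as the external
 * reference count, so the buffer is not freed (and the I/O request not
 * completed) until both the transport code and all transmitted mbufs
 * have dropped their references.  Unmapped (M_EXTPG) buffers instead
 * take one reference per mbuf in nvmf_tcp_mext_pg() and release it in
 * nvmf_tcp_free_mext_pg().
 */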
static struct mbuf *
nvmf_tcp_command_buffer_mbuf(struct nvmf_tcp_command_buffer *cb,
    uint32_t data_offset, uint32_t data_len, uint32_t *actual_len,
    bool can_truncate)
{
	struct mbuf *m;
	size_t len;

	m = memdesc_alloc_ext_mbufs(&cb->io.io_mem, nvmf_tcp_mbuf,
	    nvmf_tcp_mext_pg, cb, M_WAITOK, data_offset, data_len, &len,
	    can_truncate);
	if (actual_len != NULL)
		*actual_len = len;
	return (m);
}

/* NB: cid and ttag are little-endian already. */
static void
tcp_send_h2c_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
    uint32_t data_offset, struct mbuf *m, size_t len, bool last_pdu)
{
	struct nvme_tcp_h2c_data_hdr h2c;
	struct mbuf *top;

	memset(&h2c, 0, sizeof(h2c));
	h2c.common.pdu_type = NVME_TCP_PDU_TYPE_H2C_DATA;
	if (last_pdu)
		h2c.common.flags |= NVME_TCP_H2C_DATA_FLAGS_LAST_PDU;
	h2c.cccid = cid;
	h2c.ttag = ttag;
	h2c.datao = htole32(data_offset);
	h2c.datal = htole32(len);

	top = nvmf_tcp_construct_pdu(qp, &h2c, sizeof(h2c), m, len);
	nvmf_tcp_write_pdu(qp, top);
}

static int
nvmf_tcp_handle_r2t(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
{
	const struct nvme_tcp_r2t_hdr *r2t;
	struct nvmf_tcp_command_buffer *cb;
	uint32_t data_len, data_offset;

	r2t = (const void *)pdu->hdr;

	mtx_lock(&qp->tx_buffers.lock);
	cb = tcp_find_command_buffer(&qp->tx_buffers, r2t->cccid, 0);
	if (cb == NULL) {
		mtx_unlock(&qp->tx_buffers.lock);
		nvmf_tcp_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
		    offsetof(struct nvme_tcp_r2t_hdr, cccid), pdu->m,
		    pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	data_offset = le32toh(r2t->r2to);
	if (data_offset != cb->data_xfered) {
		mtx_unlock(&qp->tx_buffers.lock);
		nvmf_tcp_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
		    pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	/*
	 * XXX: The spec does not specify how to handle R2T transfers
	 * out of range of the original command.
	 */
	data_len = le32toh(r2t->r2tl);
	if (data_offset + data_len > cb->data_len) {
		mtx_unlock(&qp->tx_buffers.lock);
		nvmf_tcp_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
		    pdu->m, pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	cb->data_xfered += data_len;
	if (cb->data_xfered == cb->data_len)
		tcp_remove_command_buffer(&qp->tx_buffers, cb);
	else
		tcp_hold_command_buffer(cb);
	mtx_unlock(&qp->tx_buffers.lock);

	/*
	 * Queue one or more H2C_DATA PDUs containing the requested
	 * data.
	 */
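	/*
	 * Each PDU carries at most qp->max_tx_data bytes (for host
	 * queues this is already clamped to the controller's
	 * MAXH2CDATA).  nvmf_tcp_command_buffer_mbuf() may return a
	 * shorter chain than requested when truncation is permitted,
	 * so advance by 'sent' rather than 'todo'.
	 */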
	while (data_len > 0) {
		struct mbuf *m;
		uint32_t sent, todo;

		todo = data_len;
		if (todo > qp->max_tx_data)
			todo = qp->max_tx_data;
		m = nvmf_tcp_command_buffer_mbuf(cb, data_offset, todo, &sent,
		    todo < data_len);
		tcp_send_h2c_pdu(qp, r2t->cccid, r2t->ttag, data_offset, m,
		    sent, sent == data_len);

		data_offset += sent;
		data_len -= sent;
	}

	tcp_release_command_buffer(cb);
	nvmf_tcp_free_pdu(pdu);
	return (0);
}

/*
 * A variant of m_pullup that uses M_WAITOK instead of failing.  It
 * also doesn't do anything if enough bytes are already present in the
 * first mbuf.
 */
static struct mbuf *
pullup_pdu_hdr(struct mbuf *m, int len)
{
	struct mbuf *n, *p;

	KASSERT(len <= MCLBYTES, ("%s: len too large", __func__));
	if (m->m_len >= len)
		return (m);

	n = m_get2(len, M_WAITOK, MT_DATA, 0);
	n->m_len = len;
	m_copydata(m, 0, len, mtod(n, void *));

	while (m != NULL && m->m_len <= len) {
		p = m->m_next;
		len -= m->m_len;
		m_free(m);
		m = p;
	}
	if (len > 0) {
		m->m_data += len;
		m->m_len -= len;
	}
	n->m_next = m;
	return (n);
}

static int
nvmf_tcp_dispatch_pdu(struct nvmf_tcp_qpair *qp,
    const struct nvme_tcp_common_pdu_hdr *ch, struct nvmf_tcp_rxpdu *pdu)
{
	/* Ensure the PDU header is contiguous. */
	pdu->m = pullup_pdu_hdr(pdu->m, ch->hlen);
	pdu->hdr = mtod(pdu->m, const void *);

	switch (ch->pdu_type) {
	default:
		__assert_unreachable();
		break;
	case NVME_TCP_PDU_TYPE_H2C_TERM_REQ:
	case NVME_TCP_PDU_TYPE_C2H_TERM_REQ:
		return (nvmf_tcp_handle_term_req(pdu));
	case NVME_TCP_PDU_TYPE_CAPSULE_CMD:
		return (nvmf_tcp_save_command_capsule(qp, pdu));
	case NVME_TCP_PDU_TYPE_CAPSULE_RESP:
		return (nvmf_tcp_save_response_capsule(qp, pdu));
	case NVME_TCP_PDU_TYPE_H2C_DATA:
		return (nvmf_tcp_handle_h2c_data(qp, pdu));
	case NVME_TCP_PDU_TYPE_C2H_DATA:
		return (nvmf_tcp_handle_c2h_data(qp, pdu));
	case NVME_TCP_PDU_TYPE_R2T:
		return (nvmf_tcp_handle_r2t(qp, pdu));
	}
}

static void
nvmf_tcp_receive(void *arg)
{
	struct nvmf_tcp_qpair *qp = arg;
	struct socket *so = qp->so;
	struct nvmf_tcp_rxpdu pdu;
	struct nvme_tcp_common_pdu_hdr ch;
	struct uio uio;
	struct iovec iov[1];
	struct mbuf *m, *n, *tail;
	u_int avail, needed;
	int error, flags, terror;
	bool have_header;

	m = tail = NULL;
	have_header = false;
	SOCKBUF_LOCK(&so->so_rcv);
	while (!qp->rx_shutdown) {
		/* Wait until there is enough data for the next step. */
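		/*
		 * "Enough" means a full common header if one has not
		 * been peeked yet, or any additional payload bytes for
		 * the PDU currently being reassembled.
		 */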
		if (so->so_error != 0 || so->so_rerror != 0) {
			if (so->so_error != 0)
				error = so->so_error;
			else
				error = so->so_rerror;
			SOCKBUF_UNLOCK(&so->so_rcv);
error:
			m_freem(m);
			nvmf_qpair_error(&qp->qp, error);
			SOCKBUF_LOCK(&so->so_rcv);
			while (!qp->rx_shutdown)
				cv_wait(&qp->rx_cv, SOCKBUF_MTX(&so->so_rcv));
			break;
		}
		avail = sbavail(&so->so_rcv);
		if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) != 0) {
			if (!have_header && avail == 0)
				error = 0;
			else
				error = ECONNRESET;
			SOCKBUF_UNLOCK(&so->so_rcv);
			goto error;
		}
		if (avail == 0 || (!have_header && avail < sizeof(ch))) {
			cv_wait(&qp->rx_cv, SOCKBUF_MTX(&so->so_rcv));
			continue;
		}
		SOCKBUF_UNLOCK(&so->so_rcv);

		if (!have_header) {
			KASSERT(m == NULL, ("%s: m != NULL but no header",
			    __func__));
			memset(&uio, 0, sizeof(uio));
			iov[0].iov_base = &ch;
			iov[0].iov_len = sizeof(ch);
			uio.uio_iov = iov;
			uio.uio_iovcnt = 1;
			uio.uio_resid = sizeof(ch);
			uio.uio_segflg = UIO_SYSSPACE;
			uio.uio_rw = UIO_READ;
			flags = MSG_DONTWAIT | MSG_PEEK;

			error = soreceive(so, NULL, &uio, NULL, NULL, &flags);
			if (error != 0)
				goto error;
			KASSERT(uio.uio_resid == 0, ("%s: short CH read",
			    __func__));

			have_header = true;
			needed = le32toh(ch.plen);

			/*
			 * Malformed PDUs will be reported as errors
			 * by nvmf_tcp_validate_pdu.  Just pass along
			 * garbage headers if the lengths mismatch.
			 */
			if (needed < sizeof(ch) || ch.hlen > needed)
				needed = sizeof(ch);

			memset(&uio, 0, sizeof(uio));
			uio.uio_resid = needed;
		}

		flags = MSG_DONTWAIT;
		error = soreceive(so, NULL, &uio, &n, NULL, &flags);
		if (error != 0)
			goto error;

		if (m == NULL)
			m = n;
		else
			tail->m_next = n;

		if (uio.uio_resid != 0) {
			tail = n;
			while (tail->m_next != NULL)
				tail = tail->m_next;

			SOCKBUF_LOCK(&so->so_rcv);
			continue;
		}
#ifdef INVARIANTS
		tail = NULL;
#endif

		pdu.m = m;
		m = NULL;
		pdu.hdr = &ch;
		error = nvmf_tcp_validate_pdu(qp, &pdu);
		if (error != 0)
			m_freem(pdu.m);
		else
			error = nvmf_tcp_dispatch_pdu(qp, &ch, &pdu);
		if (error != 0) {
			/*
			 * If we received a termination request, close
			 * the connection immediately.
			 */
			if (error == ECONNRESET)
				goto error;

			/*
			 * Wait for up to 30 seconds for the socket to
			 * be closed by the other end.
			 */
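			/*
			 * A termination request has been queued for
			 * transmission at this point; give the peer a
			 * chance to read it and close its end before
			 * dropping the connection, but don't wait
			 * forever on a misbehaving peer.
			 */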
			SOCKBUF_LOCK(&so->so_rcv);
			if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
				terror = cv_timedwait(&qp->rx_cv,
				    SOCKBUF_MTX(&so->so_rcv), 30 * hz);
				if (terror == ETIMEDOUT)
					printf("NVMe/TCP: Timed out after sending terminate request\n");
			}
			SOCKBUF_UNLOCK(&so->so_rcv);
			goto error;
		}

		have_header = false;
		SOCKBUF_LOCK(&so->so_rcv);
	}
	SOCKBUF_UNLOCK(&so->so_rcv);
	kthread_exit();
}

static struct mbuf *
tcp_command_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_capsule *tc)
{
	struct nvmf_capsule *nc = &tc->nc;
	struct nvmf_tcp_command_buffer *cb;
	struct nvme_sgl_descriptor *sgl;
	struct nvme_tcp_cmd cmd;
	struct mbuf *top, *m;
	bool use_icd;

	use_icd = false;
	cb = NULL;
	m = NULL;

	if (nc->nc_data.io_len != 0) {
		cb = tcp_alloc_command_buffer(qp, &nc->nc_data, 0,
		    nc->nc_data.io_len, nc->nc_sqe.cid);

		if (nc->nc_send_data && nc->nc_data.io_len <= qp->max_icd) {
			use_icd = true;
			m = nvmf_tcp_command_buffer_mbuf(cb, 0,
			    nc->nc_data.io_len, NULL, false);
			cb->data_xfered = nc->nc_data.io_len;
			tcp_release_command_buffer(cb);
		} else if (nc->nc_send_data) {
			mtx_lock(&qp->tx_buffers.lock);
			tcp_add_command_buffer(&qp->tx_buffers, cb);
			mtx_unlock(&qp->tx_buffers.lock);
		} else {
			mtx_lock(&qp->rx_buffers.lock);
			tcp_add_command_buffer(&qp->rx_buffers, cb);
			mtx_unlock(&qp->rx_buffers.lock);
		}
	}

	memset(&cmd, 0, sizeof(cmd));
	cmd.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_CMD;
	cmd.ccsqe = nc->nc_sqe;

	/* Populate SGL in SQE. */
	sgl = &cmd.ccsqe.sgl;
	memset(sgl, 0, sizeof(*sgl));
	sgl->address = 0;
	sgl->length = htole32(nc->nc_data.io_len);
	if (use_icd) {
		/* Use in-capsule data. */
		sgl->type = NVME_SGL_TYPE_ICD;
	} else {
		/* Use a command buffer. */
		sgl->type = NVME_SGL_TYPE_COMMAND_BUFFER;
	}

	top = nvmf_tcp_construct_pdu(qp, &cmd, sizeof(cmd), m, m != NULL ?
	    nc->nc_data.io_len : 0);
	return (top);
}

static struct mbuf *
tcp_response_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_capsule *tc)
{
	struct nvmf_capsule *nc = &tc->nc;
	struct nvme_tcp_rsp rsp;

	memset(&rsp, 0, sizeof(rsp));
	rsp.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_RESP;
	rsp.rccqe = nc->nc_cqe;

	return (nvmf_tcp_construct_pdu(qp, &rsp, sizeof(rsp), NULL, 0));
}

static struct mbuf *
capsule_to_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_capsule *tc)
{
	if (tc->nc.nc_qe_len == sizeof(struct nvme_command))
		return (tcp_command_pdu(qp, tc));
	else
		return (tcp_response_pdu(qp, tc));
}

static void
nvmf_tcp_send(void *arg)
{
	struct nvmf_tcp_qpair *qp = arg;
	struct nvmf_tcp_capsule *tc;
	struct socket *so = qp->so;
	struct mbuf *m, *n, *p;
	u_long space, tosend;
	int error;

	m = NULL;
	SOCKBUF_LOCK(&so->so_snd);
	while (!qp->tx_shutdown) {
		if (so->so_error != 0) {
			error = so->so_error;
			SOCKBUF_UNLOCK(&so->so_snd);
error:
			m_freem(m);
			nvmf_qpair_error(&qp->qp, error);
			SOCKBUF_LOCK(&so->so_snd);
			while (!qp->tx_shutdown)
				cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd));
			break;
		}

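		/*
		 * Fully constructed PDUs queued by nvmf_tcp_write_pdu()
		 * (R2T, termination, and data PDUs) are drained before
		 * converting any newly queued capsules.
		 */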
		if (m == NULL) {
			/* Next PDU to send. */
			m = mbufq_dequeue(&qp->tx_pdus);
		}
		if (m == NULL) {
			if (STAILQ_EMPTY(&qp->tx_capsules)) {
				cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd));
				continue;
			}

			/* Convert a capsule into a PDU. */
			tc = STAILQ_FIRST(&qp->tx_capsules);
			STAILQ_REMOVE_HEAD(&qp->tx_capsules, link);
			SOCKBUF_UNLOCK(&so->so_snd);

			n = capsule_to_pdu(qp, tc);
			tcp_release_capsule(tc);

			SOCKBUF_LOCK(&so->so_snd);
			mbufq_enqueue(&qp->tx_pdus, n);
			continue;
		}

		/*
		 * Wait until there is enough room to send some data.
		 * If the socket buffer is empty, always send at least
		 * something.
		 */
		space = sbspace(&so->so_snd);
		if (space < m->m_len && sbused(&so->so_snd) != 0) {
			cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd));
			continue;
		}
		SOCKBUF_UNLOCK(&so->so_snd);

		/*
		 * If 'm' is too big, then the socket buffer must be
		 * empty.  Split 'm' to make at least some forward
		 * progress.
		 *
		 * Otherwise, chain up as many pending mbufs from 'm'
		 * that will fit.
		 */
		if (m->m_len > space) {
			n = m_split(m, space, M_WAITOK);
		} else {
			tosend = m->m_len;
			n = m->m_next;
			p = m;
			while (n != NULL && tosend + n->m_len <= space) {
				tosend += n->m_len;
				p = n;
				n = n->m_next;
			}
			KASSERT(p->m_next == n, ("%s: p not before n",
			    __func__));
			p->m_next = NULL;

			KASSERT(m_length(m, NULL) == tosend,
			    ("%s: length mismatch", __func__));
		}
		error = sosend(so, NULL, NULL, m, NULL, MSG_DONTWAIT, NULL);
		if (error != 0) {
			m = NULL;
			m_freem(n);
			goto error;
		}
		m = n;
		SOCKBUF_LOCK(&so->so_snd);
	}
	SOCKBUF_UNLOCK(&so->so_snd);
	kthread_exit();
}

static int
nvmf_soupcall_receive(struct socket *so, void *arg, int waitflag)
{
	struct nvmf_tcp_qpair *qp = arg;

	if (soreadable(so))
		cv_signal(&qp->rx_cv);
	return (SU_OK);
}

static int
nvmf_soupcall_send(struct socket *so, void *arg, int waitflag)
{
	struct nvmf_tcp_qpair *qp = arg;

	if (sowriteable(so))
		cv_signal(&qp->tx_cv);
	return (SU_OK);
}

static struct nvmf_qpair *
tcp_allocate_qpair(bool controller,
    const struct nvmf_handoff_qpair_params *params)
{
	struct nvmf_tcp_qpair *qp;
	struct socket *so;
	struct file *fp;
	cap_rights_t rights;
	int error;

	error = fget(curthread, params->tcp.fd, cap_rights_init_one(&rights,
	    CAP_SOCK_CLIENT), &fp);
	if (error != 0)
		return (NULL);
	if (fp->f_type != DTYPE_SOCKET) {
		fdrop(fp, curthread);
		return (NULL);
	}
	so = fp->f_data;
	if (so->so_type != SOCK_STREAM ||
	    so->so_proto->pr_protocol != IPPROTO_TCP) {
		fdrop(fp, curthread);
		return (NULL);
	}

	/* Claim socket from file descriptor. */
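	/*
	 * The file descriptor remains open in the process, but it no
	 * longer references the socket; the qpair now owns the socket
	 * and releases it with soclose() in tcp_free_qpair().
	 */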
	fp->f_ops = &badfileops;
	fp->f_data = NULL;
	fdrop(fp, curthread);

	qp = malloc(sizeof(*qp), M_NVMF_TCP, M_WAITOK | M_ZERO);
	qp->so = so;
	refcount_init(&qp->refs, 1);
	qp->txpda = params->tcp.txpda;
	qp->rxpda = params->tcp.rxpda;
	qp->header_digests = params->tcp.header_digests;
	qp->data_digests = params->tcp.data_digests;
	qp->maxr2t = params->tcp.maxr2t;
	qp->maxh2cdata = params->tcp.maxh2cdata;
	qp->max_tx_data = tcp_max_transmit_data;
	if (!controller) {
		if (qp->max_tx_data > params->tcp.maxh2cdata)
			qp->max_tx_data = params->tcp.maxh2cdata;
	}
	qp->max_icd = params->tcp.max_icd;

	if (controller) {
		/* Use the SUCCESS flag if SQ flow control is disabled. */
		qp->send_success = !params->sq_flow_control;

		/* NB: maxr2t is 0's based. */
		qp->num_ttags = MIN((u_int)UINT16_MAX + 1,
		    (uint64_t)params->qsize * (uint64_t)qp->maxr2t + 1);
		qp->open_ttags = mallocarray(qp->num_ttags,
		    sizeof(*qp->open_ttags), M_NVMF_TCP, M_WAITOK | M_ZERO);
	}

	TAILQ_INIT(&qp->rx_buffers.head);
	TAILQ_INIT(&qp->tx_buffers.head);
	mtx_init(&qp->rx_buffers.lock, "nvmf/tcp rx buffers", NULL, MTX_DEF);
	mtx_init(&qp->tx_buffers.lock, "nvmf/tcp tx buffers", NULL, MTX_DEF);

	cv_init(&qp->rx_cv, "-");
	cv_init(&qp->tx_cv, "-");
	mbufq_init(&qp->tx_pdus, 0);
	STAILQ_INIT(&qp->tx_capsules);

	/* Register socket upcalls. */
	SOCKBUF_LOCK(&so->so_rcv);
	soupcall_set(so, SO_RCV, nvmf_soupcall_receive, qp);
	SOCKBUF_UNLOCK(&so->so_rcv);
	SOCKBUF_LOCK(&so->so_snd);
	soupcall_set(so, SO_SND, nvmf_soupcall_send, qp);
	SOCKBUF_UNLOCK(&so->so_snd);

	/* Spin up kthreads. */
	error = kthread_add(nvmf_tcp_receive, qp, NULL, &qp->rx_thread, 0, 0,
	    "nvmef tcp rx");
	if (error != 0) {
		tcp_free_qpair(&qp->qp);
		return (NULL);
	}
	error = kthread_add(nvmf_tcp_send, qp, NULL, &qp->tx_thread, 0, 0,
	    "nvmef tcp tx");
	if (error != 0) {
		tcp_free_qpair(&qp->qp);
		return (NULL);
	}

	return (&qp->qp);
}

static void
tcp_release_qpair(struct nvmf_tcp_qpair *qp)
{
	if (refcount_release(&qp->refs))
		free(qp, M_NVMF_TCP);
}

static void
tcp_free_qpair(struct nvmf_qpair *nq)
{
	struct nvmf_tcp_qpair *qp = TQP(nq);
	struct nvmf_tcp_command_buffer *ncb, *cb;
	struct nvmf_tcp_capsule *ntc, *tc;
	struct socket *so = qp->so;

	/* Shut down kthreads and clear upcalls */
	SOCKBUF_LOCK(&so->so_snd);
	qp->tx_shutdown = true;
	if (qp->tx_thread != NULL) {
		cv_signal(&qp->tx_cv);
		mtx_sleep(qp->tx_thread, SOCKBUF_MTX(&so->so_snd), 0,
		    "nvtcptx", 0);
	}
	soupcall_clear(so, SO_SND);
	SOCKBUF_UNLOCK(&so->so_snd);

	SOCKBUF_LOCK(&so->so_rcv);
	qp->rx_shutdown = true;
	if (qp->rx_thread != NULL) {
		cv_signal(&qp->rx_cv);
		mtx_sleep(qp->rx_thread, SOCKBUF_MTX(&so->so_rcv), 0,
		    "nvtcprx", 0);
	}
	soupcall_clear(so, SO_RCV);
	SOCKBUF_UNLOCK(&so->so_rcv);

	STAILQ_FOREACH_SAFE(tc, &qp->tx_capsules, link, ntc) {
		nvmf_abort_capsule_data(&tc->nc, ECONNABORTED);
		tcp_release_capsule(tc);
	}
	mbufq_drain(&qp->tx_pdus);

	cv_destroy(&qp->tx_cv);
	cv_destroy(&qp->rx_cv);

	if (qp->open_ttags != NULL) {
		for (u_int i = 0; i < qp->num_ttags; i++) {
			cb = qp->open_ttags[i];
			if (cb != NULL) {
				cb->error = ECONNABORTED;
				tcp_release_command_buffer(cb);
			}
		}
		free(qp->open_ttags, M_NVMF_TCP);
	}

	mtx_lock(&qp->rx_buffers.lock);
	TAILQ_FOREACH_SAFE(cb, &qp->rx_buffers.head, link, ncb) {
		tcp_remove_command_buffer(&qp->rx_buffers, cb);
		mtx_unlock(&qp->rx_buffers.lock);
		cb->error = ECONNABORTED;
		tcp_release_command_buffer(cb);
		mtx_lock(&qp->rx_buffers.lock);
	}
	mtx_destroy(&qp->rx_buffers.lock);

	mtx_lock(&qp->tx_buffers.lock);
	TAILQ_FOREACH_SAFE(cb, &qp->tx_buffers.head, link, ncb) {
		tcp_remove_command_buffer(&qp->tx_buffers, cb);
		mtx_unlock(&qp->tx_buffers.lock);
		cb->error = ECONNABORTED;
		tcp_release_command_buffer(cb);
		mtx_lock(&qp->tx_buffers.lock);
	}
	mtx_destroy(&qp->tx_buffers.lock);

	soclose(so);

	tcp_release_qpair(qp);
}

static struct nvmf_capsule *
tcp_allocate_capsule(struct nvmf_qpair *nq, int how)
{
	struct nvmf_tcp_qpair *qp = TQP(nq);
	struct nvmf_tcp_capsule *tc;

	tc = malloc(sizeof(*tc), M_NVMF_TCP, how | M_ZERO);
	if (tc == NULL)
		return (NULL);
	refcount_init(&tc->refs, 1);
	refcount_acquire(&qp->refs);
	return (&tc->nc);
}

static void
tcp_release_capsule(struct nvmf_tcp_capsule *tc)
{
	struct nvmf_tcp_qpair *qp = TQP(tc->nc.nc_qpair);

	if (!refcount_release(&tc->refs))
		return;

	MPASS(tc->active_r2ts == 0);
	MPASS(tc->pending_r2ts == 0);

	nvmf_tcp_free_pdu(&tc->rx_pdu);
	free(tc, M_NVMF_TCP);
	tcp_release_qpair(qp);
}

static void
tcp_free_capsule(struct nvmf_capsule *nc)
{
	struct nvmf_tcp_capsule *tc = TCAP(nc);

	tcp_release_capsule(tc);
}

static int
tcp_transmit_capsule(struct nvmf_capsule *nc)
{
	struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
	struct nvmf_tcp_capsule *tc = TCAP(nc);
	struct socket *so = qp->so;

	refcount_acquire(&tc->refs);
	SOCKBUF_LOCK(&so->so_snd);
	STAILQ_INSERT_TAIL(&qp->tx_capsules, tc, link);
	if (sowriteable(so))
		cv_signal(&qp->tx_cv);
	SOCKBUF_UNLOCK(&so->so_snd);
	return (0);
}

static uint8_t
tcp_validate_command_capsule(struct nvmf_capsule *nc)
{
	struct nvmf_tcp_capsule *tc = TCAP(nc);
	struct nvme_sgl_descriptor *sgl;

	KASSERT(tc->rx_pdu.hdr != NULL, ("capsule wasn't received"));

	sgl = &nc->nc_sqe.sgl;
	switch (sgl->type) {
	case NVME_SGL_TYPE_ICD:
		if (tc->rx_pdu.data_len != le32toh(sgl->length)) {
			printf("NVMe/TCP: Command Capsule with mismatched ICD length\n");
			return (NVME_SC_DATA_SGL_LENGTH_INVALID);
		}
		break;
	case NVME_SGL_TYPE_COMMAND_BUFFER:
		if (tc->rx_pdu.data_len != 0) {
			printf("NVMe/TCP: Command Buffer SGL with ICD\n");
			return (NVME_SC_INVALID_FIELD);
		}
		break;
	default:
		printf("NVMe/TCP: Invalid SGL type in Command Capsule\n");
		return (NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID);
	}

	if (sgl->address != 0) {
		printf("NVMe/TCP: Invalid SGL offset in Command Capsule\n");
		return (NVME_SC_SGL_OFFSET_INVALID);
	}

	return (NVME_SC_SUCCESS);
}

static size_t
tcp_capsule_data_len(const struct nvmf_capsule *nc)
{
	MPASS(nc->nc_qe_len == sizeof(struct nvme_command));
	return (le32toh(nc->nc_sqe.sgl.length));
}

static void
tcp_receive_r2t_data(struct nvmf_capsule *nc, uint32_t data_offset,
    struct nvmf_io_request *io)
{
	struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
	struct nvmf_tcp_capsule *tc = TCAP(nc);
	struct nvmf_tcp_command_buffer *cb;

	cb = tcp_alloc_command_buffer(qp, io, data_offset, io->io_len,
	    nc->nc_sqe.cid);

	cb->tc = tc;
	refcount_acquire(&tc->refs);

	/*
	 * If this command has too many active R2Ts or there are no
	 * available transfer tags, queue the request for later.
	 *
	 * NB: maxr2t is 0's based.
	 */
	mtx_lock(&qp->rx_buffers.lock);
	if (tc->active_r2ts > qp->maxr2t || qp->active_ttags == qp->num_ttags) {
#ifdef INVARIANTS
		tc->pending_r2ts++;
#endif
		TAILQ_INSERT_TAIL(&qp->rx_buffers.head, cb, link);
		mtx_unlock(&qp->rx_buffers.lock);
		return;
	}

	nvmf_tcp_allocate_ttag(qp, cb);
	mtx_unlock(&qp->rx_buffers.lock);

	tcp_send_r2t(qp, nc->nc_sqe.cid, cb->ttag, data_offset, io->io_len);
}

static void
tcp_receive_icd_data(struct nvmf_capsule *nc, uint32_t data_offset,
    struct nvmf_io_request *io)
{
	struct nvmf_tcp_capsule *tc = TCAP(nc);

	mbuf_copyto_io(tc->rx_pdu.m, tc->rx_pdu.hdr->pdo + data_offset,
	    io->io_len, io, 0);
	nvmf_complete_io_request(io, io->io_len, 0);
}

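/*
 * Pull command data from the host into a controller I/O request.
 * In-capsule data is copied synchronously out of the received command
 * PDU; a command-buffer SGL instead kicks off an R2T exchange, and the
 * request completes asynchronously as the matching H2C_DATA PDUs
 * arrive.
 */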
static int
tcp_receive_controller_data(struct nvmf_capsule *nc, uint32_t data_offset,
    struct nvmf_io_request *io)
{
	struct nvme_sgl_descriptor *sgl;
	size_t data_len;

	if (nc->nc_qe_len != sizeof(struct nvme_command) ||
	    !nc->nc_qpair->nq_controller)
		return (EINVAL);

	sgl = &nc->nc_sqe.sgl;
	data_len = le32toh(sgl->length);
	if (data_offset + io->io_len > data_len)
		return (EFBIG);

	if (sgl->type == NVME_SGL_TYPE_ICD)
		tcp_receive_icd_data(nc, data_offset, io);
	else
		tcp_receive_r2t_data(nc, data_offset, io);
	return (0);
}

/* NB: cid is little-endian already. */
static void
tcp_send_c2h_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid, uint32_t data_offset,
    struct mbuf *m, size_t len, bool last_pdu, bool success)
{
	struct nvme_tcp_c2h_data_hdr c2h;
	struct mbuf *top;

	memset(&c2h, 0, sizeof(c2h));
	c2h.common.pdu_type = NVME_TCP_PDU_TYPE_C2H_DATA;
	if (last_pdu)
		c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_LAST_PDU;
	if (success)
		c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_SUCCESS;
	c2h.cccid = cid;
	c2h.datao = htole32(data_offset);
	c2h.datal = htole32(len);

	top = nvmf_tcp_construct_pdu(qp, &c2h, sizeof(c2h), m, len);
	nvmf_tcp_write_pdu(qp, top);
}

static u_int
tcp_send_controller_data(struct nvmf_capsule *nc, uint32_t data_offset,
    struct mbuf *m, size_t len)
{
	struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
	struct nvme_sgl_descriptor *sgl;
	struct mbuf *n, *p;
	uint32_t data_len;
	bool last_pdu, last_xfer;

	if (nc->nc_qe_len != sizeof(struct nvme_command) ||
	    !qp->qp.nq_controller) {
		m_freem(m);
		return (NVME_SC_INVALID_FIELD);
	}

	sgl = &nc->nc_sqe.sgl;
	data_len = le32toh(sgl->length);
	if (data_offset + len > data_len) {
		m_freem(m);
		return (NVME_SC_INVALID_FIELD);
	}
	last_xfer = (data_offset + len == data_len);

	if (sgl->type != NVME_SGL_TYPE_COMMAND_BUFFER) {
		m_freem(m);
		return (NVME_SC_INVALID_FIELD);
	}

	KASSERT(data_offset == TCAP(nc)->tx_data_offset,
	    ("%s: starting data_offset %u doesn't match end of previous xfer %u",
	    __func__, data_offset, TCAP(nc)->tx_data_offset));

	/* Queue one or more C2H_DATA PDUs containing the data from 'm'. */
	while (m != NULL) {
		uint32_t todo;

		todo = m->m_len;
		p = m;
		n = p->m_next;
		while (n != NULL) {
			if (todo + n->m_len > qp->max_tx_data) {
				p->m_next = NULL;
				break;
			}
			todo += n->m_len;
			p = n;
			n = p->m_next;
		}
		MPASS(m_length(m, NULL) == todo);

		last_pdu = (n == NULL && last_xfer);
		tcp_send_c2h_pdu(qp, nc->nc_sqe.cid, data_offset, m, todo,
		    last_pdu, last_pdu && qp->send_success);

		data_offset += todo;
		data_len -= todo;
		m = n;
	}
	MPASS(data_len == 0);

#ifdef INVARIANTS
	TCAP(nc)->tx_data_offset = data_offset;
#endif
	if (!last_xfer)
		return (NVMF_MORE);
	else if (qp->send_success)
		return (NVMF_SUCCESS_SENT);
	else
		return (NVME_SC_SUCCESS);
}

struct nvmf_transport_ops tcp_ops = {
	.allocate_qpair = tcp_allocate_qpair,
	.free_qpair = tcp_free_qpair,
	.allocate_capsule = tcp_allocate_capsule,
	.free_capsule = tcp_free_capsule,
	.transmit_capsule = tcp_transmit_capsule,
	.validate_command_capsule = tcp_validate_command_capsule,
	.capsule_data_len = tcp_capsule_data_len,
	.receive_controller_data = tcp_receive_controller_data,
	.send_controller_data = tcp_send_controller_data,
	.trtype = NVMF_TRTYPE_TCP,
	.priority = 0,
};

NVMF_TRANSPORT(tcp, tcp_ops);