/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2022-2024 Chelsio Communications, Inc.
 * Written by: John Baldwin <jhb@FreeBSD.org>
 */

#include <sys/param.h>
#include <sys/capsicum.h>
#include <sys/condvar.h>
#include <sys/file.h>
#include <sys/gsb_crc32.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/refcount.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/nvmf.h>
#include <dev/nvmf/nvmf_proto.h>
#include <dev/nvmf/nvmf_tcp.h>
#include <dev/nvmf/nvmf_transport.h>
#include <dev/nvmf/nvmf_transport_internal.h>

struct nvmf_tcp_capsule;
struct nvmf_tcp_qpair;

/*
 * State for one data transfer belonging to a command.  The buffer
 * carries a private copy of the I/O request, the window of the
 * request it covers, and how much of that window has been moved.
 */
struct nvmf_tcp_command_buffer {
	struct nvmf_tcp_qpair *qp;

	struct nvmf_io_request io;	/* Copied from the caller's request */
	size_t	data_len;		/* Size of the transfer window */
	size_t	data_xfered;		/* Bytes accounted for so far */
	uint32_t data_offset;		/* Start of the window */

	u_int	refs;			/* Freed when this drops to zero */
	int	error;			/* Reported when the request completes */

	/* (cid, ttag) is the lookup key used on the buffer lists. */
	uint16_t cid;
	uint16_t ttag;

	TAILQ_ENTRY(nvmf_tcp_command_buffer) link;

	/* Controller only */
	struct nvmf_tcp_capsule *tc;
};

/* A list of command buffers; the list helpers assert that lock is held. */
struct nvmf_tcp_command_buffer_list {
	TAILQ_HEAD(, nvmf_tcp_command_buffer) head;
	struct mtx lock;
};

/* Per-queue-pair transport state; embeds the generic qpair first. */
struct nvmf_tcp_qpair {
	struct nvmf_qpair qp;

	struct socket *so;

	volatile u_int refs;	/* Every allocated capsule holds a reference */
	uint8_t	txpda;		/* Alignment of the data offset in sent PDUs */
	uint8_t	rxpda;		/* Passed to the received-header validator */
	bool	header_digests;
	bool	data_digests;
	uint32_t maxr2t;	/* 0's based limit on R2Ts per capsule */
	uint32_t maxh2cdata;	/* Controller only */
	uint32_t max_tx_data;	/* Cap on the payload of one H2C_DATA PDU */
	uint32_t max_icd;	/* Host only */
	uint16_t next_ttag;	/* Controller only */
	u_int	num_ttags;	/* Controller only */
	u_int	active_ttags;	/* Controller only */
	bool	send_success;	/* Controller only */

	/* Receive state. */
	struct thread *rx_thread;
	struct cv rx_cv;
	bool	rx_shutdown;

	/* Transmit state. */
	struct thread *tx_thread;
	struct cv tx_cv;
	bool	tx_shutdown;
	struct mbufq tx_pdus;
	STAILQ_HEAD(, nvmf_tcp_capsule) tx_capsules;

	struct nvmf_tcp_command_buffer_list tx_buffers;
	struct nvmf_tcp_command_buffer_list rx_buffers;

	/*
	 * For the controller, an RX command buffer can be in one of
	 * two locations, all protected by the rx_buffers.lock.  If a
	 * receive request is waiting for either an R2T slot for its
	 * command (due to exceeding MAXR2T), or a transfer tag it is
	 * placed on the rx_buffers list.  When a request is allocated
	 * an active transfer tag, it moves to the open_ttags[] array
	 * (indexed by the tag) until it completes.
	 */
	struct nvmf_tcp_command_buffer **open_ttags;	/* Controller only */
};

/* A received PDU: the mbuf chain, its header, and validation results. */
struct nvmf_tcp_rxpdu {
	struct mbuf *m;
	const struct nvme_tcp_common_pdu_hdr *hdr;
	uint32_t data_len;
	bool data_digest_mismatch;
};

/* Transport capsule; embeds the generic capsule first (see TCAP()). */
struct nvmf_tcp_capsule {
	struct nvmf_capsule nc;

	volatile u_int refs;

	struct nvmf_tcp_rxpdu rx_pdu;	/* PDU a received capsule came from */

	uint32_t active_r2ts;		/* Controller only */
#ifdef INVARIANTS
	uint32_t tx_data_offset;	/* Controller only */
	u_int pending_r2ts;		/* Controller only */
#endif

	STAILQ_ENTRY(nvmf_tcp_capsule) link;
};

/* Downcasts from the generic objects to the TCP-specific containers. */
#define	TCAP(nc)	((struct nvmf_tcp_capsule *)(nc))
#define	TQP(qp)		((struct nvmf_tcp_qpair *)(qp))

static void	tcp_release_capsule(struct nvmf_tcp_capsule *tc);
static void	tcp_free_qpair(struct nvmf_qpair *nq);

SYSCTL_NODE(_kern_nvmf, OID_AUTO, tcp, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "TCP transport");
static u_int tcp_max_transmit_data = 256 * 1024;
SYSCTL_UINT(_kern_nvmf_tcp, OID_AUTO, max_transmit_data, CTLFLAG_RWTUN,
    &tcp_max_transmit_data, 0,
    "Maximum size of data payload in a transmitted PDU");

static
MALLOC_DEFINE(M_NVMF_TCP, "nvmf_tcp", "NVMe over TCP"); 146 147 static int 148 mbuf_crc32c_helper(void *arg, void *data, u_int len) 149 { 150 uint32_t *digestp = arg; 151 152 *digestp = calculate_crc32c(*digestp, data, len); 153 return (0); 154 } 155 156 static uint32_t 157 mbuf_crc32c(struct mbuf *m, u_int offset, u_int len) 158 { 159 uint32_t digest = 0xffffffff; 160 161 m_apply(m, offset, len, mbuf_crc32c_helper, &digest); 162 digest = digest ^ 0xffffffff; 163 164 return (digest); 165 } 166 167 static uint32_t 168 compute_digest(const void *buf, size_t len) 169 { 170 return (calculate_crc32c(0xffffffff, buf, len) ^ 0xffffffff); 171 } 172 173 static struct nvmf_tcp_command_buffer * 174 tcp_alloc_command_buffer(struct nvmf_tcp_qpair *qp, 175 const struct nvmf_io_request *io, uint32_t data_offset, size_t data_len, 176 uint16_t cid) 177 { 178 struct nvmf_tcp_command_buffer *cb; 179 180 cb = malloc(sizeof(*cb), M_NVMF_TCP, M_WAITOK); 181 cb->qp = qp; 182 cb->io = *io; 183 cb->data_offset = data_offset; 184 cb->data_len = data_len; 185 cb->data_xfered = 0; 186 refcount_init(&cb->refs, 1); 187 cb->error = 0; 188 cb->cid = cid; 189 cb->ttag = 0; 190 cb->tc = NULL; 191 192 return (cb); 193 } 194 195 static void 196 tcp_hold_command_buffer(struct nvmf_tcp_command_buffer *cb) 197 { 198 refcount_acquire(&cb->refs); 199 } 200 201 static void 202 tcp_free_command_buffer(struct nvmf_tcp_command_buffer *cb) 203 { 204 nvmf_complete_io_request(&cb->io, cb->data_xfered, cb->error); 205 if (cb->tc != NULL) 206 tcp_release_capsule(cb->tc); 207 free(cb, M_NVMF_TCP); 208 } 209 210 static void 211 tcp_release_command_buffer(struct nvmf_tcp_command_buffer *cb) 212 { 213 if (refcount_release(&cb->refs)) 214 tcp_free_command_buffer(cb); 215 } 216 217 static void 218 tcp_add_command_buffer(struct nvmf_tcp_command_buffer_list *list, 219 struct nvmf_tcp_command_buffer *cb) 220 { 221 mtx_assert(&list->lock, MA_OWNED); 222 TAILQ_INSERT_HEAD(&list->head, cb, link); 223 } 224 225 static struct 
nvmf_tcp_command_buffer * 226 tcp_find_command_buffer(struct nvmf_tcp_command_buffer_list *list, 227 uint16_t cid, uint16_t ttag) 228 { 229 struct nvmf_tcp_command_buffer *cb; 230 231 mtx_assert(&list->lock, MA_OWNED); 232 TAILQ_FOREACH(cb, &list->head, link) { 233 if (cb->cid == cid && cb->ttag == ttag) 234 return (cb); 235 } 236 return (NULL); 237 } 238 239 static void 240 tcp_remove_command_buffer(struct nvmf_tcp_command_buffer_list *list, 241 struct nvmf_tcp_command_buffer *cb) 242 { 243 mtx_assert(&list->lock, MA_OWNED); 244 TAILQ_REMOVE(&list->head, cb, link); 245 } 246 247 static void 248 tcp_purge_command_buffer(struct nvmf_tcp_command_buffer_list *list, 249 uint16_t cid, uint16_t ttag) 250 { 251 struct nvmf_tcp_command_buffer *cb; 252 253 mtx_lock(&list->lock); 254 cb = tcp_find_command_buffer(list, cid, ttag); 255 if (cb != NULL) { 256 tcp_remove_command_buffer(list, cb); 257 mtx_unlock(&list->lock); 258 tcp_release_command_buffer(cb); 259 } else 260 mtx_unlock(&list->lock); 261 } 262 263 static void 264 nvmf_tcp_write_pdu(struct nvmf_tcp_qpair *qp, struct mbuf *m) 265 { 266 struct socket *so = qp->so; 267 268 SOCKBUF_LOCK(&so->so_snd); 269 mbufq_enqueue(&qp->tx_pdus, m); 270 /* XXX: Do we need to handle sb_hiwat being wrong? */ 271 if (sowriteable(so)) 272 cv_signal(&qp->tx_cv); 273 SOCKBUF_UNLOCK(&so->so_snd); 274 } 275 276 static void 277 nvmf_tcp_report_error(struct nvmf_tcp_qpair *qp, uint16_t fes, uint32_t fei, 278 struct mbuf *rx_pdu, u_int hlen) 279 { 280 struct nvme_tcp_term_req_hdr *hdr; 281 struct mbuf *m; 282 283 if (hlen != 0) { 284 hlen = min(hlen, NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE); 285 hlen = min(hlen, m_length(rx_pdu, NULL)); 286 } 287 288 m = m_get2(sizeof(*hdr) + hlen, M_WAITOK, MT_DATA, 0); 289 m->m_len = sizeof(*hdr) + hlen; 290 hdr = mtod(m, void *); 291 memset(hdr, 0, sizeof(*hdr)); 292 hdr->common.pdu_type = qp->qp.nq_controller ? 
293 NVME_TCP_PDU_TYPE_C2H_TERM_REQ : NVME_TCP_PDU_TYPE_H2C_TERM_REQ; 294 hdr->common.hlen = sizeof(*hdr); 295 hdr->common.plen = sizeof(*hdr) + hlen; 296 hdr->fes = htole16(fes); 297 le32enc(hdr->fei, fei); 298 if (hlen != 0) 299 m_copydata(rx_pdu, 0, hlen, (caddr_t)(hdr + 1)); 300 301 nvmf_tcp_write_pdu(qp, m); 302 } 303 304 static int 305 nvmf_tcp_validate_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu) 306 { 307 const struct nvme_tcp_common_pdu_hdr *ch; 308 struct mbuf *m = pdu->m; 309 uint32_t data_len, fei, plen; 310 uint32_t digest, rx_digest; 311 u_int hlen; 312 int error; 313 uint16_t fes; 314 315 /* Determine how large of a PDU header to return for errors. */ 316 ch = pdu->hdr; 317 hlen = ch->hlen; 318 plen = le32toh(ch->plen); 319 if (hlen < sizeof(*ch) || hlen > plen) 320 hlen = sizeof(*ch); 321 322 error = nvmf_tcp_validate_pdu_header(ch, qp->qp.nq_controller, 323 qp->header_digests, qp->data_digests, qp->rxpda, &data_len, &fes, 324 &fei); 325 if (error != 0) { 326 if (error != ECONNRESET) 327 nvmf_tcp_report_error(qp, fes, fei, m, hlen); 328 return (error); 329 } 330 331 /* Check header digest if present. */ 332 if ((ch->flags & NVME_TCP_CH_FLAGS_HDGSTF) != 0) { 333 digest = mbuf_crc32c(m, 0, ch->hlen); 334 m_copydata(m, ch->hlen, sizeof(rx_digest), (caddr_t)&rx_digest); 335 if (digest != rx_digest) { 336 printf("NVMe/TCP: Header digest mismatch\n"); 337 nvmf_tcp_report_error(qp, 338 NVME_TCP_TERM_REQ_FES_HDGST_ERROR, rx_digest, m, 339 hlen); 340 return (EBADMSG); 341 } 342 } 343 344 /* Check data digest if present. 
*/ 345 pdu->data_digest_mismatch = false; 346 if ((ch->flags & NVME_TCP_CH_FLAGS_DDGSTF) != 0) { 347 digest = mbuf_crc32c(m, ch->pdo, data_len); 348 m_copydata(m, plen - sizeof(rx_digest), sizeof(rx_digest), 349 (caddr_t)&rx_digest); 350 if (digest != rx_digest) { 351 printf("NVMe/TCP: Data digest mismatch\n"); 352 pdu->data_digest_mismatch = true; 353 } 354 } 355 356 pdu->data_len = data_len; 357 return (0); 358 } 359 360 static void 361 nvmf_tcp_free_pdu(struct nvmf_tcp_rxpdu *pdu) 362 { 363 m_freem(pdu->m); 364 pdu->m = NULL; 365 pdu->hdr = NULL; 366 } 367 368 static int 369 nvmf_tcp_handle_term_req(struct nvmf_tcp_rxpdu *pdu) 370 { 371 const struct nvme_tcp_term_req_hdr *hdr; 372 373 hdr = (const void *)pdu->hdr; 374 375 printf("NVMe/TCP: Received termination request: fes %#x fei %#x\n", 376 le16toh(hdr->fes), le32dec(hdr->fei)); 377 nvmf_tcp_free_pdu(pdu); 378 return (ECONNRESET); 379 } 380 381 static int 382 nvmf_tcp_save_command_capsule(struct nvmf_tcp_qpair *qp, 383 struct nvmf_tcp_rxpdu *pdu) 384 { 385 const struct nvme_tcp_cmd *cmd; 386 struct nvmf_capsule *nc; 387 struct nvmf_tcp_capsule *tc; 388 389 cmd = (const void *)pdu->hdr; 390 391 nc = nvmf_allocate_command(&qp->qp, &cmd->ccsqe, M_WAITOK); 392 393 tc = TCAP(nc); 394 tc->rx_pdu = *pdu; 395 396 nvmf_capsule_received(&qp->qp, nc); 397 return (0); 398 } 399 400 static int 401 nvmf_tcp_save_response_capsule(struct nvmf_tcp_qpair *qp, 402 struct nvmf_tcp_rxpdu *pdu) 403 { 404 const struct nvme_tcp_rsp *rsp; 405 struct nvmf_capsule *nc; 406 struct nvmf_tcp_capsule *tc; 407 408 rsp = (const void *)pdu->hdr; 409 410 nc = nvmf_allocate_response(&qp->qp, &rsp->rccqe, M_WAITOK); 411 412 nc->nc_sqhd_valid = true; 413 tc = TCAP(nc); 414 tc->rx_pdu = *pdu; 415 416 /* 417 * Once the CQE has been received, no further transfers to the 418 * command buffer for the associated CID can occur. 
419 */ 420 tcp_purge_command_buffer(&qp->rx_buffers, rsp->rccqe.cid, 0); 421 tcp_purge_command_buffer(&qp->tx_buffers, rsp->rccqe.cid, 0); 422 423 nvmf_capsule_received(&qp->qp, nc); 424 return (0); 425 } 426 427 /* 428 * Construct a PDU that contains an optional data payload. This 429 * includes dealing with digests and the length fields in the common 430 * header. 431 */ 432 static struct mbuf * 433 nvmf_tcp_construct_pdu(struct nvmf_tcp_qpair *qp, void *hdr, size_t hlen, 434 struct mbuf *data, uint32_t data_len) 435 { 436 struct nvme_tcp_common_pdu_hdr *ch; 437 struct mbuf *top; 438 uint32_t digest, pad, pdo, plen, mlen; 439 440 plen = hlen; 441 if (qp->header_digests) 442 plen += sizeof(digest); 443 if (data_len != 0) { 444 KASSERT(m_length(data, NULL) == data_len, ("length mismatch")); 445 pdo = roundup2(plen, qp->txpda); 446 pad = pdo - plen; 447 plen = pdo + data_len; 448 if (qp->data_digests) 449 plen += sizeof(digest); 450 mlen = pdo; 451 } else { 452 KASSERT(data == NULL, ("payload mbuf with zero length")); 453 pdo = 0; 454 pad = 0; 455 mlen = plen; 456 } 457 458 top = m_get2(mlen, M_WAITOK, MT_DATA, 0); 459 top->m_len = mlen; 460 ch = mtod(top, void *); 461 memcpy(ch, hdr, hlen); 462 ch->hlen = hlen; 463 if (qp->header_digests) 464 ch->flags |= NVME_TCP_CH_FLAGS_HDGSTF; 465 if (qp->data_digests && data_len != 0) 466 ch->flags |= NVME_TCP_CH_FLAGS_DDGSTF; 467 ch->pdo = pdo; 468 ch->plen = htole32(plen); 469 470 /* HDGST */ 471 if (qp->header_digests) { 472 digest = compute_digest(ch, hlen); 473 memcpy((char *)ch + hlen, &digest, sizeof(digest)); 474 } 475 476 if (pad != 0) { 477 /* PAD */ 478 memset((char *)ch + pdo - pad, 0, pad); 479 } 480 481 if (data_len != 0) { 482 /* DATA */ 483 top->m_next = data; 484 485 /* DDGST */ 486 if (qp->data_digests) { 487 digest = mbuf_crc32c(data, 0, data_len); 488 489 /* XXX: Can't use m_append as it uses M_NOWAIT. 
*/ 490 while (data->m_next != NULL) 491 data = data->m_next; 492 493 data->m_next = m_get(M_WAITOK, MT_DATA); 494 data->m_next->m_len = sizeof(digest); 495 memcpy(mtod(data->m_next, void *), &digest, 496 sizeof(digest)); 497 } 498 } 499 500 return (top); 501 } 502 503 /* Find the next command buffer eligible to schedule for R2T. */ 504 static struct nvmf_tcp_command_buffer * 505 nvmf_tcp_next_r2t(struct nvmf_tcp_qpair *qp) 506 { 507 struct nvmf_tcp_command_buffer *cb; 508 509 mtx_assert(&qp->rx_buffers.lock, MA_OWNED); 510 MPASS(qp->active_ttags < qp->num_ttags); 511 512 TAILQ_FOREACH(cb, &qp->rx_buffers.head, link) { 513 /* NB: maxr2t is 0's based. */ 514 if (cb->tc->active_r2ts > qp->maxr2t) 515 continue; 516 #ifdef INVARIANTS 517 cb->tc->pending_r2ts--; 518 #endif 519 TAILQ_REMOVE(&qp->rx_buffers.head, cb, link); 520 return (cb); 521 } 522 return (NULL); 523 } 524 525 /* Allocate the next free transfer tag and assign it to cb. */ 526 static void 527 nvmf_tcp_allocate_ttag(struct nvmf_tcp_qpair *qp, 528 struct nvmf_tcp_command_buffer *cb) 529 { 530 uint16_t ttag; 531 532 mtx_assert(&qp->rx_buffers.lock, MA_OWNED); 533 534 ttag = qp->next_ttag; 535 for (;;) { 536 if (qp->open_ttags[ttag] == NULL) 537 break; 538 if (ttag == qp->num_ttags - 1) 539 ttag = 0; 540 else 541 ttag++; 542 MPASS(ttag != qp->next_ttag); 543 } 544 if (ttag == qp->num_ttags - 1) 545 qp->next_ttag = 0; 546 else 547 qp->next_ttag = ttag + 1; 548 549 cb->tc->active_r2ts++; 550 qp->active_ttags++; 551 qp->open_ttags[ttag] = cb; 552 553 /* 554 * Don't bother byte-swapping ttag as it is just a cookie 555 * value returned by the other end as-is. 556 */ 557 cb->ttag = ttag; 558 } 559 560 /* NB: cid and ttag are both little-endian already. 
*/ 561 static void 562 tcp_send_r2t(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag, 563 uint32_t data_offset, uint32_t data_len) 564 { 565 struct nvme_tcp_r2t_hdr r2t; 566 struct mbuf *m; 567 568 memset(&r2t, 0, sizeof(r2t)); 569 r2t.common.pdu_type = NVME_TCP_PDU_TYPE_R2T; 570 r2t.cccid = cid; 571 r2t.ttag = ttag; 572 r2t.r2to = htole32(data_offset); 573 r2t.r2tl = htole32(data_len); 574 575 m = nvmf_tcp_construct_pdu(qp, &r2t, sizeof(r2t), NULL, 0); 576 nvmf_tcp_write_pdu(qp, m); 577 } 578 579 /* 580 * Release a transfer tag and schedule another R2T. 581 * 582 * NB: This drops the rx_buffers.lock mutex. 583 */ 584 static void 585 nvmf_tcp_send_next_r2t(struct nvmf_tcp_qpair *qp, 586 struct nvmf_tcp_command_buffer *cb) 587 { 588 struct nvmf_tcp_command_buffer *ncb; 589 590 mtx_assert(&qp->rx_buffers.lock, MA_OWNED); 591 MPASS(qp->open_ttags[cb->ttag] == cb); 592 593 /* Release this transfer tag. */ 594 qp->open_ttags[cb->ttag] = NULL; 595 qp->active_ttags--; 596 cb->tc->active_r2ts--; 597 598 /* Schedule another R2T. */ 599 ncb = nvmf_tcp_next_r2t(qp); 600 if (ncb != NULL) { 601 nvmf_tcp_allocate_ttag(qp, ncb); 602 mtx_unlock(&qp->rx_buffers.lock); 603 tcp_send_r2t(qp, ncb->cid, ncb->ttag, ncb->data_offset, 604 ncb->data_len); 605 } else 606 mtx_unlock(&qp->rx_buffers.lock); 607 } 608 609 /* 610 * Copy len bytes starting at offset skip from an mbuf chain into an 611 * I/O buffer at destination offset io_offset. 
612 */ 613 static void 614 mbuf_copyto_io(struct mbuf *m, u_int skip, u_int len, 615 struct nvmf_io_request *io, u_int io_offset) 616 { 617 u_int todo; 618 619 while (m->m_len <= skip) { 620 skip -= m->m_len; 621 m = m->m_next; 622 } 623 while (len != 0) { 624 MPASS((m->m_flags & M_EXTPG) == 0); 625 626 todo = min(m->m_len - skip, len); 627 memdesc_copyback(&io->io_mem, io_offset, todo, mtodo(m, skip)); 628 skip = 0; 629 io_offset += todo; 630 len -= todo; 631 m = m->m_next; 632 } 633 } 634 635 static int 636 nvmf_tcp_handle_h2c_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu) 637 { 638 const struct nvme_tcp_h2c_data_hdr *h2c; 639 struct nvmf_tcp_command_buffer *cb; 640 uint32_t data_len, data_offset; 641 uint16_t ttag; 642 643 h2c = (const void *)pdu->hdr; 644 if (le32toh(h2c->datal) > qp->maxh2cdata) { 645 nvmf_tcp_report_error(qp, 646 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_LIMIT_EXCEEDED, 0, 647 pdu->m, pdu->hdr->hlen); 648 nvmf_tcp_free_pdu(pdu); 649 return (EBADMSG); 650 } 651 652 /* 653 * NB: Don't bother byte-swapping ttag as we don't byte-swap 654 * it when sending. 655 */ 656 ttag = h2c->ttag; 657 if (ttag >= qp->num_ttags) { 658 nvmf_tcp_report_error(qp, 659 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 660 offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->m, 661 pdu->hdr->hlen); 662 nvmf_tcp_free_pdu(pdu); 663 return (EBADMSG); 664 } 665 666 mtx_lock(&qp->rx_buffers.lock); 667 cb = qp->open_ttags[ttag]; 668 if (cb == NULL) { 669 mtx_unlock(&qp->rx_buffers.lock); 670 nvmf_tcp_report_error(qp, 671 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 672 offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->m, 673 pdu->hdr->hlen); 674 nvmf_tcp_free_pdu(pdu); 675 return (EBADMSG); 676 } 677 MPASS(cb->ttag == ttag); 678 679 /* For a data digest mismatch, fail the I/O request. 
*/ 680 if (pdu->data_digest_mismatch) { 681 nvmf_tcp_send_next_r2t(qp, cb); 682 cb->error = EINTEGRITY; 683 tcp_release_command_buffer(cb); 684 nvmf_tcp_free_pdu(pdu); 685 return (0); 686 } 687 688 data_len = le32toh(h2c->datal); 689 if (data_len != pdu->data_len) { 690 mtx_unlock(&qp->rx_buffers.lock); 691 nvmf_tcp_report_error(qp, 692 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 693 offsetof(struct nvme_tcp_h2c_data_hdr, datal), pdu->m, 694 pdu->hdr->hlen); 695 nvmf_tcp_free_pdu(pdu); 696 return (EBADMSG); 697 } 698 699 data_offset = le32toh(h2c->datao); 700 if (data_offset < cb->data_offset || 701 data_offset + data_len > cb->data_offset + cb->data_len) { 702 mtx_unlock(&qp->rx_buffers.lock); 703 nvmf_tcp_report_error(qp, 704 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, pdu->m, 705 pdu->hdr->hlen); 706 nvmf_tcp_free_pdu(pdu); 707 return (EBADMSG); 708 } 709 710 if (data_offset != cb->data_offset + cb->data_xfered) { 711 mtx_unlock(&qp->rx_buffers.lock); 712 nvmf_tcp_report_error(qp, 713 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m, 714 pdu->hdr->hlen); 715 nvmf_tcp_free_pdu(pdu); 716 return (EBADMSG); 717 } 718 719 if ((cb->data_xfered + data_len == cb->data_len) != 720 ((pdu->hdr->flags & NVME_TCP_H2C_DATA_FLAGS_LAST_PDU) != 0)) { 721 mtx_unlock(&qp->rx_buffers.lock); 722 nvmf_tcp_report_error(qp, 723 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m, 724 pdu->hdr->hlen); 725 nvmf_tcp_free_pdu(pdu); 726 return (EBADMSG); 727 } 728 729 cb->data_xfered += data_len; 730 data_offset -= cb->data_offset; 731 if (cb->data_xfered == cb->data_len) { 732 nvmf_tcp_send_next_r2t(qp, cb); 733 } else { 734 tcp_hold_command_buffer(cb); 735 mtx_unlock(&qp->rx_buffers.lock); 736 } 737 738 mbuf_copyto_io(pdu->m, pdu->hdr->pdo, data_len, &cb->io, data_offset); 739 740 tcp_release_command_buffer(cb); 741 nvmf_tcp_free_pdu(pdu); 742 return (0); 743 } 744 745 static int 746 nvmf_tcp_handle_c2h_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu) 747 { 748 
const struct nvme_tcp_c2h_data_hdr *c2h; 749 struct nvmf_tcp_command_buffer *cb; 750 uint32_t data_len, data_offset; 751 752 c2h = (const void *)pdu->hdr; 753 754 mtx_lock(&qp->rx_buffers.lock); 755 cb = tcp_find_command_buffer(&qp->rx_buffers, c2h->cccid, 0); 756 if (cb == NULL) { 757 mtx_unlock(&qp->rx_buffers.lock); 758 /* 759 * XXX: Could be PDU sequence error if cccid is for a 760 * command that doesn't use a command buffer. 761 */ 762 nvmf_tcp_report_error(qp, 763 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 764 offsetof(struct nvme_tcp_c2h_data_hdr, cccid), pdu->m, 765 pdu->hdr->hlen); 766 nvmf_tcp_free_pdu(pdu); 767 return (EBADMSG); 768 } 769 770 /* For a data digest mismatch, fail the I/O request. */ 771 if (pdu->data_digest_mismatch) { 772 cb->error = EINTEGRITY; 773 tcp_remove_command_buffer(&qp->rx_buffers, cb); 774 mtx_unlock(&qp->rx_buffers.lock); 775 tcp_release_command_buffer(cb); 776 nvmf_tcp_free_pdu(pdu); 777 return (0); 778 } 779 780 data_len = le32toh(c2h->datal); 781 if (data_len != pdu->data_len) { 782 mtx_unlock(&qp->rx_buffers.lock); 783 nvmf_tcp_report_error(qp, 784 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 785 offsetof(struct nvme_tcp_c2h_data_hdr, datal), pdu->m, 786 pdu->hdr->hlen); 787 nvmf_tcp_free_pdu(pdu); 788 return (EBADMSG); 789 } 790 791 data_offset = le32toh(c2h->datao); 792 if (data_offset < cb->data_offset || 793 data_offset + data_len > cb->data_offset + cb->data_len) { 794 mtx_unlock(&qp->rx_buffers.lock); 795 nvmf_tcp_report_error(qp, 796 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, 797 pdu->m, pdu->hdr->hlen); 798 nvmf_tcp_free_pdu(pdu); 799 return (EBADMSG); 800 } 801 802 if (data_offset != cb->data_offset + cb->data_xfered) { 803 mtx_unlock(&qp->rx_buffers.lock); 804 nvmf_tcp_report_error(qp, 805 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m, 806 pdu->hdr->hlen); 807 nvmf_tcp_free_pdu(pdu); 808 return (EBADMSG); 809 } 810 811 if ((cb->data_xfered + data_len == cb->data_len) != 812 ((pdu->hdr->flags & 
NVME_TCP_C2H_DATA_FLAGS_LAST_PDU) != 0)) { 813 mtx_unlock(&qp->rx_buffers.lock); 814 nvmf_tcp_report_error(qp, 815 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m, 816 pdu->hdr->hlen); 817 nvmf_tcp_free_pdu(pdu); 818 return (EBADMSG); 819 } 820 821 cb->data_xfered += data_len; 822 data_offset -= cb->data_offset; 823 if (cb->data_xfered == cb->data_len) 824 tcp_remove_command_buffer(&qp->rx_buffers, cb); 825 else 826 tcp_hold_command_buffer(cb); 827 mtx_unlock(&qp->rx_buffers.lock); 828 829 mbuf_copyto_io(pdu->m, pdu->hdr->pdo, data_len, &cb->io, data_offset); 830 831 tcp_release_command_buffer(cb); 832 833 if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) { 834 struct nvme_completion cqe; 835 struct nvmf_capsule *nc; 836 837 memset(&cqe, 0, sizeof(cqe)); 838 cqe.cid = c2h->cccid; 839 840 nc = nvmf_allocate_response(&qp->qp, &cqe, M_WAITOK); 841 nc->nc_sqhd_valid = false; 842 843 nvmf_capsule_received(&qp->qp, nc); 844 } 845 846 nvmf_tcp_free_pdu(pdu); 847 return (0); 848 } 849 850 /* Called when m_free drops refcount to 0. 
*/ 851 static void 852 nvmf_tcp_mbuf_done(struct mbuf *m) 853 { 854 struct nvmf_tcp_command_buffer *cb = m->m_ext.ext_arg1; 855 856 tcp_free_command_buffer(cb); 857 } 858 859 static struct mbuf * 860 nvmf_tcp_mbuf(void *arg, int how, void *data, size_t len) 861 { 862 struct nvmf_tcp_command_buffer *cb = arg; 863 struct mbuf *m; 864 865 m = m_get(how, MT_DATA); 866 m->m_flags |= M_RDONLY; 867 m_extaddref(m, data, len, &cb->refs, nvmf_tcp_mbuf_done, cb, NULL); 868 m->m_len = len; 869 return (m); 870 } 871 872 static void 873 nvmf_tcp_free_mext_pg(struct mbuf *m) 874 { 875 struct nvmf_tcp_command_buffer *cb = m->m_ext.ext_arg1; 876 877 M_ASSERTEXTPG(m); 878 tcp_release_command_buffer(cb); 879 } 880 881 static struct mbuf * 882 nvmf_tcp_mext_pg(void *arg, int how) 883 { 884 struct nvmf_tcp_command_buffer *cb = arg; 885 struct mbuf *m; 886 887 m = mb_alloc_ext_pgs(how, nvmf_tcp_free_mext_pg); 888 m->m_ext.ext_arg1 = cb; 889 tcp_hold_command_buffer(cb); 890 return (m); 891 } 892 893 /* 894 * Return an mbuf chain for a range of data belonging to a command 895 * buffer. 896 * 897 * The mbuf chain uses M_EXT mbufs which hold references on the 898 * command buffer so that it remains "alive" until the data has been 899 * fully transmitted. If truncate_ok is true, then the mbuf chain 900 * might return a short chain to avoid gratuitously splitting up a 901 * page. 902 */ 903 static struct mbuf * 904 nvmf_tcp_command_buffer_mbuf(struct nvmf_tcp_command_buffer *cb, 905 uint32_t data_offset, uint32_t data_len, uint32_t *actual_len, 906 bool can_truncate) 907 { 908 struct mbuf *m; 909 size_t len; 910 911 m = memdesc_alloc_ext_mbufs(&cb->io.io_mem, nvmf_tcp_mbuf, 912 nvmf_tcp_mext_pg, cb, M_WAITOK, data_offset, data_len, &len, 913 can_truncate); 914 if (actual_len != NULL) 915 *actual_len = len; 916 return (m); 917 } 918 919 /* NB: cid and ttag and little-endian already. 
*/ 920 static void 921 tcp_send_h2c_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag, 922 uint32_t data_offset, struct mbuf *m, size_t len, bool last_pdu) 923 { 924 struct nvme_tcp_h2c_data_hdr h2c; 925 struct mbuf *top; 926 927 memset(&h2c, 0, sizeof(h2c)); 928 h2c.common.pdu_type = NVME_TCP_PDU_TYPE_H2C_DATA; 929 if (last_pdu) 930 h2c.common.flags |= NVME_TCP_H2C_DATA_FLAGS_LAST_PDU; 931 h2c.cccid = cid; 932 h2c.ttag = ttag; 933 h2c.datao = htole32(data_offset); 934 h2c.datal = htole32(len); 935 936 top = nvmf_tcp_construct_pdu(qp, &h2c, sizeof(h2c), m, len); 937 nvmf_tcp_write_pdu(qp, top); 938 } 939 940 static int 941 nvmf_tcp_handle_r2t(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu) 942 { 943 const struct nvme_tcp_r2t_hdr *r2t; 944 struct nvmf_tcp_command_buffer *cb; 945 uint32_t data_len, data_offset; 946 947 r2t = (const void *)pdu->hdr; 948 949 mtx_lock(&qp->tx_buffers.lock); 950 cb = tcp_find_command_buffer(&qp->tx_buffers, r2t->cccid, 0); 951 if (cb == NULL) { 952 mtx_unlock(&qp->tx_buffers.lock); 953 nvmf_tcp_report_error(qp, 954 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 955 offsetof(struct nvme_tcp_r2t_hdr, cccid), pdu->m, 956 pdu->hdr->hlen); 957 nvmf_tcp_free_pdu(pdu); 958 return (EBADMSG); 959 } 960 961 data_offset = le32toh(r2t->r2to); 962 if (data_offset != cb->data_xfered) { 963 mtx_unlock(&qp->tx_buffers.lock); 964 nvmf_tcp_report_error(qp, 965 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m, 966 pdu->hdr->hlen); 967 nvmf_tcp_free_pdu(pdu); 968 return (EBADMSG); 969 } 970 971 /* 972 * XXX: The spec does not specify how to handle R2T tranfers 973 * out of range of the original command. 
974 */ 975 data_len = le32toh(r2t->r2tl); 976 if (data_offset + data_len > cb->data_len) { 977 mtx_unlock(&qp->tx_buffers.lock); 978 nvmf_tcp_report_error(qp, 979 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, 980 pdu->m, pdu->hdr->hlen); 981 nvmf_tcp_free_pdu(pdu); 982 return (EBADMSG); 983 } 984 985 cb->data_xfered += data_len; 986 if (cb->data_xfered == cb->data_len) 987 tcp_remove_command_buffer(&qp->tx_buffers, cb); 988 else 989 tcp_hold_command_buffer(cb); 990 mtx_unlock(&qp->tx_buffers.lock); 991 992 /* 993 * Queue one or more H2C_DATA PDUs containing the requested 994 * data. 995 */ 996 while (data_len > 0) { 997 struct mbuf *m; 998 uint32_t sent, todo; 999 1000 todo = min(data_len, qp->max_tx_data); 1001 m = nvmf_tcp_command_buffer_mbuf(cb, data_offset, todo, &sent, 1002 todo < data_len); 1003 tcp_send_h2c_pdu(qp, r2t->cccid, r2t->ttag, data_offset, m, 1004 sent, sent == data_len); 1005 1006 data_offset += sent; 1007 data_len -= sent; 1008 } 1009 1010 tcp_release_command_buffer(cb); 1011 nvmf_tcp_free_pdu(pdu); 1012 return (0); 1013 } 1014 1015 /* 1016 * A variant of m_pullup that uses M_WAITOK instead of failing. It 1017 * also doesn't do anything if enough bytes are already present in the 1018 * first mbuf. 
*/
static struct mbuf *
pullup_pdu_hdr(struct mbuf *m, int len)
{
	struct mbuf *n, *p;

	KASSERT(len <= MCLBYTES, ("%s: len too large", __func__));
	if (m->m_len >= len)
		return (m);

	/* Copy the first len bytes into a fresh contiguous mbuf. */
	n = m_get2(len, M_WAITOK, MT_DATA, 0);
	n->m_len = len;
	m_copydata(m, 0, len, mtod(n, void *));

	/* Free leading mbufs that were copied in their entirety. */
	while (m != NULL && m->m_len <= len) {
		p = m->m_next;
		len -= m->m_len;
		m_free(m);
		m = p;
	}
	/* Trim the copied prefix from a partially consumed mbuf. */
	if (len > 0) {
		m->m_data += len;
		m->m_len -= len;
	}
	n->m_next = m;
	return (n);
}

/*
 * Make the PDU header contiguous, point pdu->hdr at it, and hand the
 * PDU to the handler for its type.  Returns the handler's result.
 */
static int
nvmf_tcp_dispatch_pdu(struct nvmf_tcp_qpair *qp,
    const struct nvme_tcp_common_pdu_hdr *ch, struct nvmf_tcp_rxpdu *pdu)
{
	/* Ensure the PDU header is contiguous. */
	pdu->m = pullup_pdu_hdr(pdu->m, ch->hlen);
	pdu->hdr = mtod(pdu->m, const void *);

	switch (ch->pdu_type) {
	default:
		__assert_unreachable();
		break;
	case NVME_TCP_PDU_TYPE_H2C_TERM_REQ:
	case NVME_TCP_PDU_TYPE_C2H_TERM_REQ:
		return (nvmf_tcp_handle_term_req(pdu));
	case NVME_TCP_PDU_TYPE_CAPSULE_CMD:
		return (nvmf_tcp_save_command_capsule(qp, pdu));
	case NVME_TCP_PDU_TYPE_CAPSULE_RESP:
		return (nvmf_tcp_save_response_capsule(qp, pdu));
	case NVME_TCP_PDU_TYPE_H2C_DATA:
		return (nvmf_tcp_handle_h2c_data(qp, pdu));
	case NVME_TCP_PDU_TYPE_C2H_DATA:
		return (nvmf_tcp_handle_c2h_data(qp, pdu));
	case NVME_TCP_PDU_TYPE_R2T:
		return (nvmf_tcp_handle_r2t(qp, pdu));
	}
}

/*
 * Receive thread body.  Loops until rx_shutdown is set: peeks at the
 * common header to learn the PDU length, reads that many bytes into
 * an mbuf chain (possibly across several passes), then validates and
 * dispatches the PDU.
 *
 * On any error the qpair is notified via nvmf_qpair_error() and the
 * thread parks until rx_shutdown is set.  The so_rcv lock is held at
 * the top of each iteration and while waiting on rx_cv, and is
 * dropped around soreceive() and PDU processing.
 */
static void
nvmf_tcp_receive(void *arg)
{
	struct nvmf_tcp_qpair *qp = arg;
	struct socket *so = qp->so;
	struct nvmf_tcp_rxpdu pdu;
	struct nvme_tcp_common_pdu_hdr ch;
	struct uio uio;
	struct iovec iov[1];
	struct mbuf *m, *n, *tail;
	u_int avail, needed;
	int error, flags, terror;
	bool have_header;

	m = tail = NULL;
	have_header = false;
	SOCKBUF_LOCK(&so->so_rcv);
	while (!qp->rx_shutdown) {
		/* Wait until there is enough data for the next step. */
		if (so->so_error != 0 || so->so_rerror != 0) {
			if (so->so_error != 0)
				error = so->so_error;
			else
				error = so->so_rerror;
			SOCKBUF_UNLOCK(&so->so_rcv);
		error:
			/*
			 * Common error exit, entered with the lock
			 * dropped: discard any partial PDU, report
			 * the error, then wait for shutdown.
			 */
			m_freem(m);
			nvmf_qpair_error(&qp->qp, error);
			SOCKBUF_LOCK(&so->so_rcv);
			while (!qp->rx_shutdown)
				cv_wait(&qp->rx_cv, SOCKBUF_MTX(&so->so_rcv));
			break;
		}
		avail = sbavail(&so->so_rcv);
		if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) != 0) {
			/* EOF on a PDU boundary is reported as error 0. */
			if (!have_header && avail == 0)
				error = 0;
			else
				error = ECONNRESET;
			SOCKBUF_UNLOCK(&so->so_rcv);
			goto error;
		}
		if (avail == 0 || (!have_header && avail < sizeof(ch))) {
			cv_wait(&qp->rx_cv, SOCKBUF_MTX(&so->so_rcv));
			continue;
		}
		SOCKBUF_UNLOCK(&so->so_rcv);

		if (!have_header) {
			KASSERT(m == NULL, ("%s: m != NULL but no header",
			    __func__));
			/* Peek at the common header without consuming it. */
			memset(&uio, 0, sizeof(uio));
			iov[0].iov_base = &ch;
			iov[0].iov_len = sizeof(ch);
			uio.uio_iov = iov;
			uio.uio_iovcnt = 1;
			uio.uio_resid = sizeof(ch);
			uio.uio_segflg = UIO_SYSSPACE;
			uio.uio_rw = UIO_READ;
			flags = MSG_DONTWAIT | MSG_PEEK;

			error = soreceive(so, NULL, &uio, NULL, NULL, &flags);
			if (error != 0)
				goto error;
			KASSERT(uio.uio_resid == 0, ("%s: short CH read",
			    __func__));

			have_header = true;
			needed = le32toh(ch.plen);

			/*
			 * Malformed PDUs will be reported as errors
			 * by nvmf_tcp_validate_pdu.  Just pass along
			 * garbage headers if the lengths mismatch.
			 */
			if (needed < sizeof(ch) || ch.hlen > needed)
				needed = sizeof(ch);

			/* uio now only tracks how many bytes remain. */
			memset(&uio, 0, sizeof(uio));
			uio.uio_resid = needed;
		}

		flags = MSG_DONTWAIT;
		error = soreceive(so, NULL, &uio, &n, NULL, &flags);
		if (error != 0)
			goto error;

		/* Append the new mbufs to the partial PDU. */
		if (m == NULL)
			m = n;
		else
			tail->m_next = n;

		if (uio.uio_resid != 0) {
			/* Short read: remember the tail and wait for more. */
			tail = n;
			while (tail->m_next != NULL)
				tail = tail->m_next;

			SOCKBUF_LOCK(&so->so_rcv);
			continue;
		}
#ifdef INVARIANTS
		tail = NULL;
#endif

		/* The PDU now owns the chain. */
		pdu.m = m;
		m = NULL;
		pdu.hdr = &ch;
		error = nvmf_tcp_validate_pdu(qp, &pdu);
		if (error != 0)
			m_freem(pdu.m);
		else
			error = nvmf_tcp_dispatch_pdu(qp, &ch, &pdu);
		if (error != 0) {
			/*
			 * If we received a termination request, close
			 * the connection immediately.
			 */
			if (error == ECONNRESET)
				goto error;

			/*
			 * Wait for up to 30 seconds for the socket to
			 * be closed by the other end.
			 */
			SOCKBUF_LOCK(&so->so_rcv);
			if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
				terror = cv_timedwait(&qp->rx_cv,
				    SOCKBUF_MTX(&so->so_rcv), 30 * hz);
				if (terror == ETIMEDOUT)
					printf("NVMe/TCP: Timed out after sending terminate request\n");
			}
			SOCKBUF_UNLOCK(&so->so_rcv);
			goto error;
		}

		have_header = false;
		SOCKBUF_LOCK(&so->so_rcv);
	}
	SOCKBUF_UNLOCK(&so->so_rcv);
	kthread_exit();
}

/*
 * Build the PDU for a command capsule.
 *
 * If the capsule has data, a command buffer is allocated for it.
 * Data being sent that fits within max_icd is attached as in-capsule
 * data; otherwise the buffer is placed on tx_buffers (sending) or
 * rx_buffers (receiving).  The SGL in the SQE is rewritten to
 * describe whichever form was chosen.
 */
static struct mbuf *
tcp_command_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_capsule *tc)
{
	struct nvmf_capsule *nc = &tc->nc;
	struct nvmf_tcp_command_buffer *cb;
	struct nvme_sgl_descriptor *sgl;
	struct nvme_tcp_cmd cmd;
	struct mbuf *top, *m;
	bool use_icd;

	use_icd = false;
	cb = NULL;
	m = NULL;

	if (nc->nc_data.io_len != 0) {
		cb = tcp_alloc_command_buffer(qp, &nc->nc_data, 0,
		    nc->nc_data.io_len, nc->nc_sqe.cid);

		if (nc->nc_send_data && nc->nc_data.io_len <= qp->max_icd) {
			use_icd = true;
			m = nvmf_tcp_command_buffer_mbuf(cb, 0,
			    nc->nc_data.io_len, NULL, false);
			cb->data_xfered = nc->nc_data.io_len;
			/* The mbufs now keep the buffer alive. */
			tcp_release_command_buffer(cb);
		} else if (nc->nc_send_data) {
			mtx_lock(&qp->tx_buffers.lock);
			tcp_add_command_buffer(&qp->tx_buffers, cb);
			mtx_unlock(&qp->tx_buffers.lock);
		} else {
			mtx_lock(&qp->rx_buffers.lock);
			tcp_add_command_buffer(&qp->rx_buffers, cb);
			mtx_unlock(&qp->rx_buffers.lock);
		}
	}

	memset(&cmd, 0, sizeof(cmd));
	cmd.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_CMD;
	cmd.ccsqe = nc->nc_sqe;

	/* Populate SGL in SQE. */
	sgl = &cmd.ccsqe.sgl;
	memset(sgl, 0, sizeof(*sgl));
	sgl->address = 0;
	sgl->length = htole32(nc->nc_data.io_len);
	if (use_icd) {
		/* Use in-capsule data. */
		sgl->type = NVME_SGL_TYPE_ICD;
	} else {
		/* Use a command buffer. */
		sgl->type = NVME_SGL_TYPE_COMMAND_BUFFER;
	}

	top = nvmf_tcp_construct_pdu(qp, &cmd, sizeof(cmd), m, m != NULL ?
	    nc->nc_data.io_len : 0);
	return (top);
}

/* Build the PDU for a response capsule; it carries only the CQE. */
static struct mbuf *
tcp_response_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_capsule *tc)
{
	struct nvmf_capsule *nc = &tc->nc;
	struct nvme_tcp_rsp rsp;

	memset(&rsp, 0, sizeof(rsp));
	rsp.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_RESP;
	rsp.rccqe = nc->nc_cqe;

	return (nvmf_tcp_construct_pdu(qp, &rsp, sizeof(rsp), NULL, 0));
}

/* Pick the command or response encoder based on the queue entry size. */
static struct mbuf *
capsule_to_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_capsule *tc)
{
	if (tc->nc.nc_qe_len == sizeof(struct nvme_command))
		return (tcp_command_pdu(qp, tc));
	else
		return (tcp_response_pdu(qp, tc));
}

static void
nvmf_tcp_send(void *arg)
{
	struct nvmf_tcp_qpair *qp = arg;
	struct nvmf_tcp_capsule *tc;
	struct socket *so = qp->so;
	struct mbuf *m, *n, *p;
	u_long space, tosend;
	int error;

	m = NULL;
	SOCKBUF_LOCK(&so->so_snd);
	while (!qp->tx_shutdown) {
		if (so->so_error != 0) {
			error = so->so_error;
			SOCKBUF_UNLOCK(&so->so_snd);
		error:
			m_freem(m);
			nvmf_qpair_error(&qp->qp, error);
			SOCKBUF_LOCK(&so->so_snd);
			while (!qp->tx_shutdown)
				cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd));
			break;
		}

		if (m == NULL) {
			/* Next PDU to send. */
			m = mbufq_dequeue(&qp->tx_pdus);
		}
		if (m == NULL) {
			if (STAILQ_EMPTY(&qp->tx_capsules)) {
				cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd));
				continue;
			}

			/* Convert a capsule into a PDU.
*/ 1332 tc = STAILQ_FIRST(&qp->tx_capsules); 1333 STAILQ_REMOVE_HEAD(&qp->tx_capsules, link); 1334 SOCKBUF_UNLOCK(&so->so_snd); 1335 1336 n = capsule_to_pdu(qp, tc); 1337 tcp_release_capsule(tc); 1338 1339 SOCKBUF_LOCK(&so->so_snd); 1340 mbufq_enqueue(&qp->tx_pdus, n); 1341 continue; 1342 } 1343 1344 /* 1345 * Wait until there is enough room to send some data. 1346 * If the socket buffer is empty, always send at least 1347 * something. 1348 */ 1349 space = sbspace(&so->so_snd); 1350 if (space < m->m_len && sbused(&so->so_snd) != 0) { 1351 cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd)); 1352 continue; 1353 } 1354 SOCKBUF_UNLOCK(&so->so_snd); 1355 1356 /* 1357 * If 'm' is too big, then the socket buffer must be 1358 * empty. Split 'm' to make at least some forward 1359 * progress. 1360 * 1361 * Otherwise, chain up as many pending mbufs from 'm' 1362 * that will fit. 1363 */ 1364 if (m->m_len > space) { 1365 n = m_split(m, space, M_WAITOK); 1366 } else { 1367 tosend = m->m_len; 1368 n = m->m_next; 1369 p = m; 1370 while (n != NULL && tosend + n->m_len <= space) { 1371 tosend += n->m_len; 1372 p = n; 1373 n = n->m_next; 1374 } 1375 KASSERT(p->m_next == n, ("%s: p not before n", 1376 __func__)); 1377 p->m_next = NULL; 1378 1379 KASSERT(m_length(m, NULL) == tosend, 1380 ("%s: length mismatch", __func__)); 1381 } 1382 error = sosend(so, NULL, NULL, m, NULL, MSG_DONTWAIT, NULL); 1383 if (error != 0) { 1384 m = NULL; 1385 m_freem(n); 1386 goto error; 1387 } 1388 m = n; 1389 SOCKBUF_LOCK(&so->so_snd); 1390 } 1391 SOCKBUF_UNLOCK(&so->so_snd); 1392 kthread_exit(); 1393 } 1394 1395 static int 1396 nvmf_soupcall_receive(struct socket *so, void *arg, int waitflag) 1397 { 1398 struct nvmf_tcp_qpair *qp = arg; 1399 1400 if (soreadable(so)) 1401 cv_signal(&qp->rx_cv); 1402 return (SU_OK); 1403 } 1404 1405 static int 1406 nvmf_soupcall_send(struct socket *so, void *arg, int waitflag) 1407 { 1408 struct nvmf_tcp_qpair *qp = arg; 1409 1410 if (sowriteable(so)) 1411 
cv_signal(&qp->tx_cv); 1412 return (SU_OK); 1413 } 1414 1415 static struct nvmf_qpair * 1416 tcp_allocate_qpair(bool controller, 1417 const struct nvmf_handoff_qpair_params *params) 1418 { 1419 struct nvmf_tcp_qpair *qp; 1420 struct socket *so; 1421 struct file *fp; 1422 cap_rights_t rights; 1423 int error; 1424 1425 error = fget(curthread, params->tcp.fd, cap_rights_init_one(&rights, 1426 CAP_SOCK_CLIENT), &fp); 1427 if (error != 0) 1428 return (NULL); 1429 if (fp->f_type != DTYPE_SOCKET) { 1430 fdrop(fp, curthread); 1431 return (NULL); 1432 } 1433 so = fp->f_data; 1434 if (so->so_type != SOCK_STREAM || 1435 so->so_proto->pr_protocol != IPPROTO_TCP) { 1436 fdrop(fp, curthread); 1437 return (NULL); 1438 } 1439 1440 /* Claim socket from file descriptor. */ 1441 fp->f_ops = &badfileops; 1442 fp->f_data = NULL; 1443 fdrop(fp, curthread); 1444 1445 qp = malloc(sizeof(*qp), M_NVMF_TCP, M_WAITOK | M_ZERO); 1446 qp->so = so; 1447 refcount_init(&qp->refs, 1); 1448 qp->txpda = params->tcp.txpda; 1449 qp->rxpda = params->tcp.rxpda; 1450 qp->header_digests = params->tcp.header_digests; 1451 qp->data_digests = params->tcp.data_digests; 1452 qp->maxr2t = params->tcp.maxr2t; 1453 if (controller) 1454 qp->maxh2cdata = params->tcp.maxh2cdata; 1455 qp->max_tx_data = tcp_max_transmit_data; 1456 if (!controller) { 1457 qp->max_tx_data = min(qp->max_tx_data, params->tcp.maxh2cdata); 1458 qp->max_icd = params->tcp.max_icd; 1459 } 1460 1461 if (controller) { 1462 /* Use the SUCCESS flag if SQ flow control is disabled. */ 1463 qp->send_success = !params->sq_flow_control; 1464 1465 /* NB: maxr2t is 0's based. 
*/ 1466 qp->num_ttags = MIN((u_int)UINT16_MAX + 1, 1467 (uint64_t)params->qsize * ((uint64_t)qp->maxr2t + 1)); 1468 qp->open_ttags = mallocarray(qp->num_ttags, 1469 sizeof(*qp->open_ttags), M_NVMF_TCP, M_WAITOK | M_ZERO); 1470 } 1471 1472 TAILQ_INIT(&qp->rx_buffers.head); 1473 TAILQ_INIT(&qp->tx_buffers.head); 1474 mtx_init(&qp->rx_buffers.lock, "nvmf/tcp rx buffers", NULL, MTX_DEF); 1475 mtx_init(&qp->tx_buffers.lock, "nvmf/tcp tx buffers", NULL, MTX_DEF); 1476 1477 cv_init(&qp->rx_cv, "-"); 1478 cv_init(&qp->tx_cv, "-"); 1479 mbufq_init(&qp->tx_pdus, 0); 1480 STAILQ_INIT(&qp->tx_capsules); 1481 1482 /* Register socket upcalls. */ 1483 SOCKBUF_LOCK(&so->so_rcv); 1484 soupcall_set(so, SO_RCV, nvmf_soupcall_receive, qp); 1485 SOCKBUF_UNLOCK(&so->so_rcv); 1486 SOCKBUF_LOCK(&so->so_snd); 1487 soupcall_set(so, SO_SND, nvmf_soupcall_send, qp); 1488 SOCKBUF_UNLOCK(&so->so_snd); 1489 1490 /* Spin up kthreads. */ 1491 error = kthread_add(nvmf_tcp_receive, qp, NULL, &qp->rx_thread, 0, 0, 1492 "nvmef tcp rx"); 1493 if (error != 0) { 1494 tcp_free_qpair(&qp->qp); 1495 return (NULL); 1496 } 1497 error = kthread_add(nvmf_tcp_send, qp, NULL, &qp->tx_thread, 0, 0, 1498 "nvmef tcp tx"); 1499 if (error != 0) { 1500 tcp_free_qpair(&qp->qp); 1501 return (NULL); 1502 } 1503 1504 return (&qp->qp); 1505 } 1506 1507 static void 1508 tcp_release_qpair(struct nvmf_tcp_qpair *qp) 1509 { 1510 if (refcount_release(&qp->refs)) 1511 free(qp, M_NVMF_TCP); 1512 } 1513 1514 static void 1515 tcp_free_qpair(struct nvmf_qpair *nq) 1516 { 1517 struct nvmf_tcp_qpair *qp = TQP(nq); 1518 struct nvmf_tcp_command_buffer *ncb, *cb; 1519 struct nvmf_tcp_capsule *ntc, *tc; 1520 struct socket *so = qp->so; 1521 1522 /* Shut down kthreads and clear upcalls */ 1523 SOCKBUF_LOCK(&so->so_snd); 1524 qp->tx_shutdown = true; 1525 if (qp->tx_thread != NULL) { 1526 cv_signal(&qp->tx_cv); 1527 mtx_sleep(qp->tx_thread, SOCKBUF_MTX(&so->so_snd), 0, 1528 "nvtcptx", 0); 1529 } 1530 soupcall_clear(so, SO_SND); 1531 
SOCKBUF_UNLOCK(&so->so_snd); 1532 1533 SOCKBUF_LOCK(&so->so_rcv); 1534 qp->rx_shutdown = true; 1535 if (qp->rx_thread != NULL) { 1536 cv_signal(&qp->rx_cv); 1537 mtx_sleep(qp->rx_thread, SOCKBUF_MTX(&so->so_rcv), 0, 1538 "nvtcprx", 0); 1539 } 1540 soupcall_clear(so, SO_RCV); 1541 SOCKBUF_UNLOCK(&so->so_rcv); 1542 1543 STAILQ_FOREACH_SAFE(tc, &qp->tx_capsules, link, ntc) { 1544 nvmf_abort_capsule_data(&tc->nc, ECONNABORTED); 1545 tcp_release_capsule(tc); 1546 } 1547 mbufq_drain(&qp->tx_pdus); 1548 1549 cv_destroy(&qp->tx_cv); 1550 cv_destroy(&qp->rx_cv); 1551 1552 if (qp->open_ttags != NULL) { 1553 for (u_int i = 0; i < qp->num_ttags; i++) { 1554 cb = qp->open_ttags[i]; 1555 if (cb != NULL) { 1556 cb->tc->active_r2ts--; 1557 cb->error = ECONNABORTED; 1558 tcp_release_command_buffer(cb); 1559 } 1560 } 1561 free(qp->open_ttags, M_NVMF_TCP); 1562 } 1563 1564 mtx_lock(&qp->rx_buffers.lock); 1565 TAILQ_FOREACH_SAFE(cb, &qp->rx_buffers.head, link, ncb) { 1566 tcp_remove_command_buffer(&qp->rx_buffers, cb); 1567 mtx_unlock(&qp->rx_buffers.lock); 1568 #ifdef INVARIANTS 1569 if (cb->tc != NULL) 1570 cb->tc->pending_r2ts--; 1571 #endif 1572 cb->error = ECONNABORTED; 1573 tcp_release_command_buffer(cb); 1574 mtx_lock(&qp->rx_buffers.lock); 1575 } 1576 mtx_destroy(&qp->rx_buffers.lock); 1577 1578 mtx_lock(&qp->tx_buffers.lock); 1579 TAILQ_FOREACH_SAFE(cb, &qp->tx_buffers.head, link, ncb) { 1580 tcp_remove_command_buffer(&qp->tx_buffers, cb); 1581 mtx_unlock(&qp->tx_buffers.lock); 1582 cb->error = ECONNABORTED; 1583 tcp_release_command_buffer(cb); 1584 mtx_lock(&qp->tx_buffers.lock); 1585 } 1586 mtx_destroy(&qp->tx_buffers.lock); 1587 1588 soclose(so); 1589 1590 tcp_release_qpair(qp); 1591 } 1592 1593 static struct nvmf_capsule * 1594 tcp_allocate_capsule(struct nvmf_qpair *nq, int how) 1595 { 1596 struct nvmf_tcp_qpair *qp = TQP(nq); 1597 struct nvmf_tcp_capsule *tc; 1598 1599 tc = malloc(sizeof(*tc), M_NVMF_TCP, how | M_ZERO); 1600 if (tc == NULL) 1601 return (NULL); 1602 
refcount_init(&tc->refs, 1); 1603 refcount_acquire(&qp->refs); 1604 return (&tc->nc); 1605 } 1606 1607 static void 1608 tcp_release_capsule(struct nvmf_tcp_capsule *tc) 1609 { 1610 struct nvmf_tcp_qpair *qp = TQP(tc->nc.nc_qpair); 1611 1612 if (!refcount_release(&tc->refs)) 1613 return; 1614 1615 MPASS(tc->active_r2ts == 0); 1616 MPASS(tc->pending_r2ts == 0); 1617 1618 nvmf_tcp_free_pdu(&tc->rx_pdu); 1619 free(tc, M_NVMF_TCP); 1620 tcp_release_qpair(qp); 1621 } 1622 1623 static void 1624 tcp_free_capsule(struct nvmf_capsule *nc) 1625 { 1626 struct nvmf_tcp_capsule *tc = TCAP(nc); 1627 1628 tcp_release_capsule(tc); 1629 } 1630 1631 static int 1632 tcp_transmit_capsule(struct nvmf_capsule *nc) 1633 { 1634 struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair); 1635 struct nvmf_tcp_capsule *tc = TCAP(nc); 1636 struct socket *so = qp->so; 1637 1638 refcount_acquire(&tc->refs); 1639 SOCKBUF_LOCK(&so->so_snd); 1640 STAILQ_INSERT_TAIL(&qp->tx_capsules, tc, link); 1641 if (sowriteable(so)) 1642 cv_signal(&qp->tx_cv); 1643 SOCKBUF_UNLOCK(&so->so_snd); 1644 return (0); 1645 } 1646 1647 static uint8_t 1648 tcp_validate_command_capsule(struct nvmf_capsule *nc) 1649 { 1650 struct nvmf_tcp_capsule *tc = TCAP(nc); 1651 struct nvme_sgl_descriptor *sgl; 1652 1653 KASSERT(tc->rx_pdu.hdr != NULL, ("capsule wasn't received")); 1654 1655 sgl = &nc->nc_sqe.sgl; 1656 switch (sgl->type) { 1657 case NVME_SGL_TYPE_ICD: 1658 if (tc->rx_pdu.data_len != le32toh(sgl->length)) { 1659 printf("NVMe/TCP: Command Capsule with mismatched ICD length\n"); 1660 return (NVME_SC_DATA_SGL_LENGTH_INVALID); 1661 } 1662 break; 1663 case NVME_SGL_TYPE_COMMAND_BUFFER: 1664 if (tc->rx_pdu.data_len != 0) { 1665 printf("NVMe/TCP: Command Buffer SGL with ICD\n"); 1666 return (NVME_SC_INVALID_FIELD); 1667 } 1668 break; 1669 default: 1670 printf("NVMe/TCP: Invalid SGL type in Command Capsule\n"); 1671 return (NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID); 1672 } 1673 1674 if (sgl->address != 0) { 1675 printf("NVMe/TCP: Invalid SGL 
offset in Command Capsule\n"); 1676 return (NVME_SC_SGL_OFFSET_INVALID); 1677 } 1678 1679 return (NVME_SC_SUCCESS); 1680 } 1681 1682 static size_t 1683 tcp_capsule_data_len(const struct nvmf_capsule *nc) 1684 { 1685 MPASS(nc->nc_qe_len == sizeof(struct nvme_command)); 1686 return (le32toh(nc->nc_sqe.sgl.length)); 1687 } 1688 1689 static void 1690 tcp_receive_r2t_data(struct nvmf_capsule *nc, uint32_t data_offset, 1691 struct nvmf_io_request *io) 1692 { 1693 struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair); 1694 struct nvmf_tcp_capsule *tc = TCAP(nc); 1695 struct nvmf_tcp_command_buffer *cb; 1696 1697 cb = tcp_alloc_command_buffer(qp, io, data_offset, io->io_len, 1698 nc->nc_sqe.cid); 1699 1700 cb->tc = tc; 1701 refcount_acquire(&tc->refs); 1702 1703 /* 1704 * If this command has too many active R2Ts or there are no 1705 * available transfer tags, queue the request for later. 1706 * 1707 * NB: maxr2t is 0's based. 1708 */ 1709 mtx_lock(&qp->rx_buffers.lock); 1710 if (tc->active_r2ts > qp->maxr2t || qp->active_ttags == qp->num_ttags) { 1711 #ifdef INVARIANTS 1712 tc->pending_r2ts++; 1713 #endif 1714 TAILQ_INSERT_TAIL(&qp->rx_buffers.head, cb, link); 1715 mtx_unlock(&qp->rx_buffers.lock); 1716 return; 1717 } 1718 1719 nvmf_tcp_allocate_ttag(qp, cb); 1720 mtx_unlock(&qp->rx_buffers.lock); 1721 1722 tcp_send_r2t(qp, nc->nc_sqe.cid, cb->ttag, data_offset, io->io_len); 1723 } 1724 1725 static void 1726 tcp_receive_icd_data(struct nvmf_capsule *nc, uint32_t data_offset, 1727 struct nvmf_io_request *io) 1728 { 1729 struct nvmf_tcp_capsule *tc = TCAP(nc); 1730 1731 mbuf_copyto_io(tc->rx_pdu.m, tc->rx_pdu.hdr->pdo + data_offset, 1732 io->io_len, io, 0); 1733 nvmf_complete_io_request(io, io->io_len, 0); 1734 } 1735 1736 static int 1737 tcp_receive_controller_data(struct nvmf_capsule *nc, uint32_t data_offset, 1738 struct nvmf_io_request *io) 1739 { 1740 struct nvme_sgl_descriptor *sgl; 1741 size_t data_len; 1742 1743 if (nc->nc_qe_len != sizeof(struct nvme_command) || 1744 
!nc->nc_qpair->nq_controller) 1745 return (EINVAL); 1746 1747 sgl = &nc->nc_sqe.sgl; 1748 data_len = le32toh(sgl->length); 1749 if (data_offset + io->io_len > data_len) 1750 return (EFBIG); 1751 1752 if (sgl->type == NVME_SGL_TYPE_ICD) 1753 tcp_receive_icd_data(nc, data_offset, io); 1754 else 1755 tcp_receive_r2t_data(nc, data_offset, io); 1756 return (0); 1757 } 1758 1759 /* NB: cid is little-endian already. */ 1760 static void 1761 tcp_send_c2h_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid, uint32_t data_offset, 1762 struct mbuf *m, size_t len, bool last_pdu, bool success) 1763 { 1764 struct nvme_tcp_c2h_data_hdr c2h; 1765 struct mbuf *top; 1766 1767 memset(&c2h, 0, sizeof(c2h)); 1768 c2h.common.pdu_type = NVME_TCP_PDU_TYPE_C2H_DATA; 1769 if (last_pdu) 1770 c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_LAST_PDU; 1771 if (success) 1772 c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_SUCCESS; 1773 c2h.cccid = cid; 1774 c2h.datao = htole32(data_offset); 1775 c2h.datal = htole32(len); 1776 1777 top = nvmf_tcp_construct_pdu(qp, &c2h, sizeof(c2h), m, len); 1778 nvmf_tcp_write_pdu(qp, top); 1779 } 1780 1781 static u_int 1782 tcp_send_controller_data(struct nvmf_capsule *nc, uint32_t data_offset, 1783 struct mbuf *m, size_t len) 1784 { 1785 struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair); 1786 struct nvme_sgl_descriptor *sgl; 1787 struct mbuf *n, *p; 1788 uint32_t data_len; 1789 bool last_pdu, last_xfer; 1790 1791 if (nc->nc_qe_len != sizeof(struct nvme_command) || 1792 !qp->qp.nq_controller) { 1793 m_freem(m); 1794 return (NVME_SC_INVALID_FIELD); 1795 } 1796 1797 sgl = &nc->nc_sqe.sgl; 1798 data_len = le32toh(sgl->length); 1799 if (data_offset + len > data_len) { 1800 m_freem(m); 1801 return (NVME_SC_INVALID_FIELD); 1802 } 1803 last_xfer = (data_offset + len == data_len); 1804 1805 if (sgl->type != NVME_SGL_TYPE_COMMAND_BUFFER) { 1806 m_freem(m); 1807 return (NVME_SC_INVALID_FIELD); 1808 } 1809 1810 KASSERT(data_offset == TCAP(nc)->tx_data_offset, 1811 ("%s: starting data_offset 
%u doesn't match end of previous xfer %u", 1812 __func__, data_offset, TCAP(nc)->tx_data_offset)); 1813 1814 /* Queue one more C2H_DATA PDUs containing the data from 'm'. */ 1815 while (m != NULL) { 1816 uint32_t todo; 1817 1818 todo = m->m_len; 1819 p = m; 1820 n = p->m_next; 1821 while (n != NULL) { 1822 if (todo + n->m_len > qp->max_tx_data) { 1823 p->m_next = NULL; 1824 break; 1825 } 1826 todo += n->m_len; 1827 p = n; 1828 n = p->m_next; 1829 } 1830 MPASS(m_length(m, NULL) == todo); 1831 1832 last_pdu = (n == NULL && last_xfer); 1833 tcp_send_c2h_pdu(qp, nc->nc_sqe.cid, data_offset, m, todo, 1834 last_pdu, last_pdu && qp->send_success); 1835 1836 data_offset += todo; 1837 data_len -= todo; 1838 m = n; 1839 } 1840 MPASS(data_len == 0); 1841 1842 #ifdef INVARIANTS 1843 TCAP(nc)->tx_data_offset = data_offset; 1844 #endif 1845 if (!last_xfer) 1846 return (NVMF_MORE); 1847 else if (qp->send_success) 1848 return (NVMF_SUCCESS_SENT); 1849 else 1850 return (NVME_SC_SUCCESS); 1851 } 1852 1853 struct nvmf_transport_ops tcp_ops = { 1854 .allocate_qpair = tcp_allocate_qpair, 1855 .free_qpair = tcp_free_qpair, 1856 .allocate_capsule = tcp_allocate_capsule, 1857 .free_capsule = tcp_free_capsule, 1858 .transmit_capsule = tcp_transmit_capsule, 1859 .validate_command_capsule = tcp_validate_command_capsule, 1860 .capsule_data_len = tcp_capsule_data_len, 1861 .receive_controller_data = tcp_receive_controller_data, 1862 .send_controller_data = tcp_send_controller_data, 1863 .trtype = NVMF_TRTYPE_TCP, 1864 .priority = 0, 1865 }; 1866 1867 NVMF_TRANSPORT(tcp, tcp_ops); 1868