/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2023 Chelsio Communications, Inc.
 * Written by: John Baldwin <jhb@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "opt_inet.h"

#include <sys/param.h>
#include <sys/libkern.h>
#include <sys/kernel.h>
#include <sys/module.h>

#ifdef TCP_OFFLOAD
#include <sys/bitset.h>
#include <sys/capsicum.h>
#include <sys/file.h>
#include <sys/kthread.h>
#include <sys/ktr.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/nv.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp_var.h>
#include <netinet/toecore.h>

#include <dev/nvmf/nvmf.h>
#include <dev/nvmf/nvmf_proto.h>
#include <dev/nvmf/nvmf_tcp.h>
#include <dev/nvmf/nvmf_transport.h>
#include <dev/nvmf/nvmf_transport_internal.h>

#include <vm/pmap.h>
#include <vm/vm_page.h>

#include "common/common.h"
#include "common/t4_regs.h"
#include "common/t4_tcb.h"
#include "tom/t4_tom.h"

/* Status code values in CPL_NVMT_CMP. */
#define	CMP_STATUS_ERROR_MASK		0x7f
#define	CMP_STATUS_NO_ERROR		0
#define	CMP_STATUS_HEADER_DIGEST	1
#define	CMP_STATUS_DIRECTION_MISMATCH	2
#define	CMP_STATUS_DIGEST_FLAG_MISMATCH	3
#define	CMP_STATUS_SUCCESS_NOT_LAST	4
#define	CMP_STATUS_BAD_DATA_LENGTH	5
#define	CMP_STATUS_USER_MODE_UNALLOCATED 6
#define	CMP_STATUS_RQT_LIMIT		7
#define	CMP_STATUS_RQT_WRAP		8
#define	CMP_STATUS_RQT_BOUND		9
#define	CMP_STATUS_TPT_LIMIT		16
#define	CMP_STATUS_TPT_INVALID		17
#define	CMP_STATUS_TPT_COLOUR_MISMATCH	18
#define	CMP_STATUS_TPT_MISC		19
#define	CMP_STATUS_TPT_WRAP		20
#define	CMP_STATUS_TPT_BOUND		21
#define	CMP_STATUS_TPT_LAST_PDU_UNALIGNED 22
#define	CMP_STATUS_PBL_LIMIT		24
#define	CMP_STATUS_DATA_DIGEST		25
#define	CMP_STATUS_DDP			0x80

/*
 * Transfer tags and CIDs with the MSB set are "unallocated" tags that
 * pass data through to the freelist without using DDP.
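 * Tags with the MSB clear are DDP tags built with CHE_DDP_TAG() below:
 * the upper bits hold the STAG index and the low 4 bits hold a "color"
 * that is bumped each time the STAG table wraps.  For example, STAG
 * index 5 with color 3 encodes as tag 0x0053, while freelist tags
 * occupy the range 0x8000-0xffff.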
 */
#define	CHE_FL_TAG_MASK		0x8000
#define	CHE_MAX_FL_TAG		0x7fff
#define	CHE_NUM_FL_TAGS		(CHE_MAX_FL_TAG + 1)

#define	CHE_TAG_IS_FL(ttag)	(((ttag) & CHE_FL_TAG_MASK) == CHE_FL_TAG_MASK)
#define	CHE_RAW_FL_TAG(ttag)	((ttag) & ~CHE_FL_TAG_MASK)
#define	CHE_DDP_TAG(stag_idx, color) ((stag_idx) << 4 | (color))
#define	CHE_STAG_COLOR(stag)	((stag) & 0xf)
#define	CHE_STAG_IDX(stag)	((stag) >> 4)
#define	CHE_DDP_MAX_COLOR	0xf

#define	CHE_DDP_NO_TAG		0xffff

/*
 * A bitmap of non-DDP CIDs in use on the host.  Since there is no
 * _BIT_FFC (find first clear), the bitset is inverted so that a clear
 * bit indicates an in-use CID.
 */
BITSET_DEFINE(fl_cid_set, CHE_NUM_FL_TAGS);
#define	FL_CID_INIT(p)		__BIT_FILL(CHE_NUM_FL_TAGS, p)
#define	FL_CID_BUSY(n, p)	__BIT_CLR(CHE_NUM_FL_TAGS, n, p)
#define	FL_CID_ISACTIVE(n, p)	!__BIT_ISSET(CHE_NUM_FL_TAGS, n, p)
#define	FL_CID_FREE(n, p)	__BIT_SET(CHE_NUM_FL_TAGS, n, p)
#define	FL_CID_FINDFREE_AT(p, start) __BIT_FFS_AT(CHE_NUM_FL_TAGS, p, start)

/*
 * The TCP sequence number of both CPL_NVMT_DATA and CPL_NVMT_CMP
 * mbufs is saved here while the mbuf is in qp->rx_data and qp->rx_pdus.
 */
#define	nvmf_tcp_seq	PH_loc.thirtytwo[0]

/*
 * The CPL status of CPL_NVMT_CMP mbufs is saved here while the mbuf
 * is in qp->rx_pdus.
 */
#define	nvmf_cpl_status	PH_loc.eight[4]

struct nvmf_che_capsule;
struct nvmf_che_qpair;

struct nvmf_che_adapter {
	struct adapter *sc;

	u_int ddp_threshold;
	u_int max_transmit_pdu;
	u_int max_receive_pdu;
	bool nvmt_data_iqe;

	struct sysctl_ctx_list ctx;	/* from uld_activate to deactivate */
};

struct nvmf_che_command_buffer {
	struct nvmf_che_qpair *qp;

	struct nvmf_io_request io;
	size_t data_len;
	size_t data_xfered;
	uint32_t data_offset;

	u_int refs;
	int error;

	bool ddp_ok;
	uint16_t cid;
	uint16_t ttag;
	uint16_t original_cid;		/* Host only */

	TAILQ_ENTRY(nvmf_che_command_buffer) link;

	/* Fields used for DDP. */
	struct fw_ri_tpte tpte;
	uint64_t *pbl;
	uint32_t pbl_addr;
	uint32_t pbl_len;

	/* Controller only */
	struct nvmf_che_capsule *cc;
};

struct nvmf_che_command_buffer_list {
	TAILQ_HEAD(, nvmf_che_command_buffer) head;
	struct mtx lock;
};

struct nvmf_che_qpair {
	struct nvmf_qpair qp;

	struct socket *so;
	struct toepcb *toep;
	struct nvmf_che_adapter *nca;

	volatile u_int refs;	/* Every allocated capsule holds a reference */
	uint8_t txpda;
	uint8_t rxpda;
	bool header_digests;
	bool data_digests;
	uint32_t maxr2t;
	uint32_t maxh2cdata;	/* Controller only */
	uint32_t max_rx_data;
	uint32_t max_tx_data;
	uint32_t max_icd;	/* Host only */
	uint32_t max_ioccsz;	/* Controller only */
	union {
		uint16_t next_fl_ttag;	/* Controller only */
		uint16_t next_cid;	/* Host only */
	};
	uint16_t next_ddp_tag;
	u_int num_fl_ttags;		/* Controller only */
	u_int active_fl_ttags;		/* Controller only */
	u_int num_ddp_tags;
	u_int active_ddp_tags;
	bool send_success;		/* Controller only */
	uint8_t ddp_color;
	uint32_t tpt_offset;

	/* Receive state. */
	struct thread *rx_thread;
	struct cv rx_cv;
	bool rx_shutdown;
	int rx_error;
	struct mbufq rx_data;	/* Data received via CPL_NVMT_DATA. */
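	/*
	 * Data mbufs are paired with their PDU headers by the rx
	 * thread using the TCP sequence numbers saved in the pkthdr
	 * (see nvmf_che_attach_pdu_data()).
	 */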
	struct mbufq rx_pdus;	/* PDU headers received via CPL_NVMT_CMP. */

	/* Transmit state. */
	struct thread *tx_thread;
	struct cv tx_cv;
	bool tx_shutdown;
	STAILQ_HEAD(, nvmf_che_capsule) tx_capsules;

	struct nvmf_che_command_buffer_list tx_buffers;
	struct nvmf_che_command_buffer_list rx_buffers;

	/*
	 * For the controller, an RX command buffer can be in one of
	 * three locations, all protected by the rx_buffers.lock.  If
	 * a receive request is waiting for either an R2T slot for its
	 * command (due to exceeding MAXR2T), or a transfer tag, it is
	 * placed on the rx_buffers list.  When a request is allocated
	 * an active transfer tag, it moves to either the
	 * open_ddp_tags[] or open_fl_ttags[] array (indexed by the
	 * tag) until it completes.
	 *
	 * For the host, an RX command buffer using DDP is in
	 * open_ddp_tags[], otherwise it is in rx_buffers.
	 */
	struct nvmf_che_command_buffer **open_ddp_tags;
	struct nvmf_che_command_buffer **open_fl_ttags;	/* Controller only */

	/*
	 * For the host, CIDs submitted by nvmf(4) must be rewritten
	 * to either use DDP or not use DDP.  The CID in response
	 * capsules must be restored to their original value.  For
	 * DDP, the original CID is stored in the command buffer.
	 * These variables manage non-DDP CIDs.
	 */
	uint16_t *fl_cids;		/* Host only */
	struct fl_cid_set *fl_cid_set;	/* Host only */
	struct mtx fl_cid_lock;		/* Host only */
};

struct nvmf_che_rxpdu {
	struct mbuf *m;
	const struct nvme_tcp_common_pdu_hdr *hdr;
	uint32_t data_len;
	bool data_digest_mismatch;
	bool ddp;
};

struct nvmf_che_capsule {
	struct nvmf_capsule nc;

	volatile u_int refs;

	struct nvmf_che_rxpdu rx_pdu;

	uint32_t active_r2ts;		/* Controller only */
#ifdef INVARIANTS
	uint32_t tx_data_offset;	/* Controller only */
	u_int pending_r2ts;		/* Controller only */
#endif

	STAILQ_ENTRY(nvmf_che_capsule) link;
};

#define	CCAP(nc)	((struct nvmf_che_capsule *)(nc))
#define	CQP(qp)		((struct nvmf_che_qpair *)(qp))

static void	che_release_capsule(struct nvmf_che_capsule *cc);
static void	che_free_qpair(struct nvmf_qpair *nq);

SYSCTL_NODE(_kern_nvmf, OID_AUTO, che, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "Chelsio TCP offload transport");

static u_int che_max_transmit_pdu = 32 * 1024;
SYSCTL_UINT(_kern_nvmf_che, OID_AUTO, max_transmit_pdu, CTLFLAG_RWTUN,
    &che_max_transmit_pdu, 0,
    "Maximum size of a transmitted PDU");

static u_int che_max_receive_pdu = 32 * 1024;
SYSCTL_UINT(_kern_nvmf_che, OID_AUTO, max_receive_pdu, CTLFLAG_RWTUN,
    &che_max_receive_pdu, 0,
    "Maximum size of a received PDU");

static int use_dsgl = 1;
SYSCTL_INT(_kern_nvmf_che, OID_AUTO, use_dsgl, CTLFLAG_RWTUN, &use_dsgl, 0,
    "Use DSGL for PBL/FastReg (default=1)");

static int inline_threshold = 256;
SYSCTL_INT(_kern_nvmf_che, OID_AUTO, inline_threshold, CTLFLAG_RWTUN,
    &inline_threshold, 0,
    "inline vs dsgl threshold (default=256)");

static int ddp_tags_per_qp = 128;
SYSCTL_INT(_kern_nvmf_che, OID_AUTO, ddp_tags_per_qp, CTLFLAG_RWTUN,
    &ddp_tags_per_qp, 0,
    "Number of DDP tags to reserve for each queue pair");

static MALLOC_DEFINE(M_NVMF_CHE, "nvmf_che", "Chelsio NVMe-TCP offload");

/*
 * PBL regions consist of N full-sized pages.
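 * For example, with 4KB pages an I/O of 0x2400 bytes that starts at
 * offset 0x200 into its first page uses howmany(0x200 + 0x2400,
 * PAGE_SIZE) = 3 PBL entries with an FBO of 0x200 (see che_fbo() and
 * che_npages() below).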
 * TPT entries support an
 * initial offset into the first page (FBO) and can handle a partial
 * length on the last page.
 */
static bool
che_ddp_io_check(struct nvmf_che_qpair *qp, const struct nvmf_io_request *io)
{
	const struct memdesc *mem = &io->io_mem;
	struct bus_dma_segment *ds;
	int i;

	if (io->io_len < qp->nca->ddp_threshold) {
		return (false);
	}

	switch (mem->md_type) {
	case MEMDESC_VADDR:
	case MEMDESC_PADDR:
	case MEMDESC_VMPAGES:
		return (true);
	case MEMDESC_VLIST:
	case MEMDESC_PLIST:
		/*
		 * Require all but the first segment to start on a
		 * page boundary.  Require all but the last segment to
		 * end on a page boundary.
		 */
		ds = mem->u.md_list;
		for (i = 0; i < mem->md_nseg; i++, ds++) {
			if (i != 0 && ds->ds_addr % PAGE_SIZE != 0)
				return (false);
			if (i != mem->md_nseg - 1 &&
			    (ds->ds_addr + ds->ds_len) % PAGE_SIZE != 0)
				return (false);
		}
		return (true);
	default:
		/*
		 * Other types could be validated with more work, but
		 * they aren't used currently by nvmf(4) or nvmft(4).
		 */
		return (false);
	}
}

static u_int
che_fbo(struct nvmf_che_command_buffer *cb)
{
	struct memdesc *mem = &cb->io.io_mem;

	switch (mem->md_type) {
	case MEMDESC_VADDR:
		return ((uintptr_t)mem->u.md_vaddr & PAGE_MASK);
	case MEMDESC_PADDR:
		return (mem->u.md_paddr & PAGE_MASK);
	case MEMDESC_VMPAGES:
		return (mem->md_offset);
	case MEMDESC_VLIST:
	case MEMDESC_PLIST:
		return (mem->u.md_list[0].ds_addr & PAGE_MASK);
	default:
		__assert_unreachable();
	}
}

static u_int
che_npages(struct nvmf_che_command_buffer *cb)
{
	return (howmany(che_fbo(cb) + cb->io.io_len, PAGE_SIZE));
}

static struct nvmf_che_command_buffer *
che_alloc_command_buffer(struct nvmf_che_qpair *qp,
    const struct nvmf_io_request *io, uint32_t data_offset, size_t data_len,
    uint16_t cid)
{
	struct nvmf_che_command_buffer *cb;

	cb = malloc(sizeof(*cb), M_NVMF_CHE, M_WAITOK);
	cb->qp = qp;
	cb->io = *io;
	cb->data_offset = data_offset;
	cb->data_len = data_len;
	cb->data_xfered = 0;
	refcount_init(&cb->refs, 1);
	cb->error = 0;
	cb->ddp_ok = che_ddp_io_check(qp, io);
	cb->cid = cid;
	cb->ttag = 0;
	cb->original_cid = 0;
	cb->cc = NULL;
	cb->pbl = NULL;

	return (cb);
}

static void
che_hold_command_buffer(struct nvmf_che_command_buffer *cb)
{
	refcount_acquire(&cb->refs);
}

static void
che_free_command_buffer(struct nvmf_che_command_buffer *cb)
{
	nvmf_complete_io_request(&cb->io, cb->data_xfered, cb->error);
	if (cb->cc != NULL)
		che_release_capsule(cb->cc);
	MPASS(cb->pbl == NULL);
	free(cb, M_NVMF_CHE);
}

static void
che_release_command_buffer(struct nvmf_che_command_buffer *cb)
{
	if (refcount_release(&cb->refs))
		che_free_command_buffer(cb);
}

static void
che_add_command_buffer(struct nvmf_che_command_buffer_list *list,
    struct nvmf_che_command_buffer *cb)
{
	mtx_assert(&list->lock, MA_OWNED);
	TAILQ_INSERT_HEAD(&list->head, cb, link);
}

static struct nvmf_che_command_buffer *
che_find_command_buffer(struct nvmf_che_command_buffer_list *list,
    uint16_t cid)
{
	struct nvmf_che_command_buffer *cb;

	mtx_assert(&list->lock, MA_OWNED);
	TAILQ_FOREACH(cb, &list->head, link) {
		if (cb->cid == cid)
			return (cb);
	}
	return (NULL);
}

static void
che_remove_command_buffer(struct nvmf_che_command_buffer_list *list,
    struct nvmf_che_command_buffer *cb)
{
	mtx_assert(&list->lock, MA_OWNED);
	TAILQ_REMOVE(&list->head, cb, link);
}

static void
che_purge_command_buffer(struct nvmf_che_command_buffer_list *list,
    uint16_t cid)
{
	struct nvmf_che_command_buffer *cb;

	mtx_lock(&list->lock);
	cb = che_find_command_buffer(list, cid);
	if (cb != NULL) {
		che_remove_command_buffer(list, cb);
		mtx_unlock(&list->lock);
		che_release_command_buffer(cb);
	} else
		mtx_unlock(&list->lock);
}

static int
che_write_mem_inline(struct adapter *sc, struct toepcb *toep, uint32_t addr,
    uint32_t len, void *data, struct mbufq *wrq)
{
	struct mbuf *m;
	char *cp;
	int copy_len, i, num_wqe, wr_len;

#ifdef VERBOSE_TRACES
	CTR(KTR_CXGBE, "%s: addr 0x%x len %u", __func__, addr << 5, len);
#endif
	num_wqe = DIV_ROUND_UP(len, T4_MAX_INLINE_SIZE);
	cp = data;
	for (i = 0; i < num_wqe; i++) {
		copy_len = min(len, T4_MAX_INLINE_SIZE);
		wr_len = T4_WRITE_MEM_INLINE_LEN(copy_len);

		m = alloc_raw_wr_mbuf(wr_len);
		if (m == NULL)
			return (ENOMEM);
		t4_write_mem_inline_wr(sc, mtod(m, void *), wr_len, toep->tid,
		    addr, copy_len, cp, 0);
		if (cp != NULL)
			cp += T4_MAX_INLINE_SIZE;
		addr += T4_MAX_INLINE_SIZE >> 5;
		len -= T4_MAX_INLINE_SIZE;

		mbufq_enqueue(wrq, m);
	}
	return (0);
}

static int
che_write_mem_dma_aligned(struct adapter *sc, struct toepcb *toep,
    uint32_t addr, uint32_t len, void *data, struct mbufq *wrq)
{
	struct mbuf *m;
	vm_offset_t va;
	u_int todo;
	int wr_len;

	/* First page.
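	 * Note that the destination address is expressed in 32-byte
	 * units (it is shifted left by 5 to form a byte address), so
	 * it advances by todo >> 5 per work request.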
	 */
	va = (vm_offset_t)data;
	todo = min(PAGE_SIZE - (va % PAGE_SIZE), len);
	wr_len = T4_WRITE_MEM_DMA_LEN;
	m = alloc_raw_wr_mbuf(wr_len);
	if (m == NULL)
		return (ENOMEM);
	t4_write_mem_dma_wr(sc, mtod(m, void *), wr_len, toep->tid, addr,
	    todo, pmap_kextract(va), 0);
	mbufq_enqueue(wrq, m);
	len -= todo;
	addr += todo >> 5;
	va += todo;

	while (len > 0) {
		MPASS(va == trunc_page(va));
		todo = min(PAGE_SIZE, len);
		m = alloc_raw_wr_mbuf(wr_len);
		if (m == NULL)
			return (ENOMEM);
		t4_write_mem_dma_wr(sc, mtod(m, void *), wr_len, toep->tid,
		    addr, todo, pmap_kextract(va), 0);
		mbufq_enqueue(wrq, m);
		len -= todo;
		addr += todo >> 5;
		va += todo;
	}
	return (0);
}

static int
che_write_adapter_mem(struct nvmf_che_qpair *qp, uint32_t addr, uint32_t len,
    void *data)
{
	struct adapter *sc = qp->nca->sc;
	struct toepcb *toep = qp->toep;
	struct socket *so = qp->so;
	struct inpcb *inp = sotoinpcb(so);
	struct mbufq mq;
	int error;

	mbufq_init(&mq, INT_MAX);
	if (!use_dsgl || len < inline_threshold || data == NULL)
		error = che_write_mem_inline(sc, toep, addr, len, data, &mq);
	else
		error = che_write_mem_dma_aligned(sc, toep, addr, len, data,
		    &mq);
	if (__predict_false(error != 0))
		goto error;

	INP_WLOCK(inp);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		error = ECONNRESET;
		goto error;
	}
	mbufq_concat(&toep->ulp_pduq, &mq);
	INP_WUNLOCK(inp);
	return (0);

error:
	mbufq_drain(&mq);
	return (error);
}

static bool
che_alloc_pbl(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb)
{
	struct adapter *sc = qp->nca->sc;
	struct memdesc *mem = &cb->io.io_mem;
	uint64_t *pbl;
	uint32_t addr, len;
	u_int i, npages;
	int error;

	MPASS(cb->pbl == NULL);
	MPASS(cb->ddp_ok);

	/* Hardware limit?  iWARP only enforces this for T5. */
	if (cb->io.io_len >= (8 * 1024 * 1024 * 1024ULL))
		return (false);

	npages = che_npages(cb);
	len = roundup2(npages, 4) * sizeof(*cb->pbl);
	addr = t4_pblpool_alloc(sc, len);
	if (addr == 0)
		return (false);

	pbl = malloc(len, M_NVMF_CHE, M_NOWAIT | M_ZERO);
	if (pbl == NULL) {
		t4_pblpool_free(sc, addr, len);
		return (false);
	}

	switch (mem->md_type) {
	case MEMDESC_VADDR:
	{
		vm_offset_t va;

		va = trunc_page((uintptr_t)mem->u.md_vaddr);
		for (i = 0; i < npages; i++)
			pbl[i] = htobe64(pmap_kextract(va + i * PAGE_SIZE));
		break;
	}
	case MEMDESC_PADDR:
	{
		vm_paddr_t pa;

		pa = trunc_page(mem->u.md_paddr);
		for (i = 0; i < npages; i++)
			pbl[i] = htobe64(pa + i * PAGE_SIZE);
		break;
	}
	case MEMDESC_VMPAGES:
		for (i = 0; i < npages; i++)
			pbl[i] = htobe64(VM_PAGE_TO_PHYS(mem->u.md_ma[i]));
		break;
	case MEMDESC_VLIST:
	{
		struct bus_dma_segment *ds;
		vm_offset_t va;
		vm_size_t len;
		u_int j, k;

		i = 0;
		ds = mem->u.md_list;
		for (j = 0; j < mem->md_nseg; j++, ds++) {
			va = trunc_page((uintptr_t)ds->ds_addr);
			len = ds->ds_len;
			if (ds->ds_addr % PAGE_SIZE != 0)
				len += ds->ds_addr % PAGE_SIZE;
			for (k = 0; k < howmany(len, PAGE_SIZE); k++) {
				pbl[i] = htobe64(pmap_kextract(va +
				    k * PAGE_SIZE));
				i++;
			}
		}
		MPASS(i == npages);
		break;
	}
	case MEMDESC_PLIST:
	{
		struct bus_dma_segment *ds;
		vm_paddr_t pa;
		vm_size_t len;
		u_int j, k;

		i = 0;
		ds = mem->u.md_list;
		for (j = 0; j < mem->md_nseg; j++, ds++) {
			pa = trunc_page((vm_paddr_t)ds->ds_addr);
			len = ds->ds_len;
			if (ds->ds_addr % PAGE_SIZE != 0)
				len += ds->ds_addr % PAGE_SIZE;
			for (k = 0; k < howmany(len, PAGE_SIZE); k++) {
				pbl[i] = htobe64(pa + k * PAGE_SIZE);
				i++;
			}
		}
		MPASS(i == npages);
		break;
	}
	default:
		__assert_unreachable();
	}

	error = che_write_adapter_mem(qp, addr >> 5, len, pbl);
	if (error != 0) {
		t4_pblpool_free(sc, addr, len);
		free(pbl, M_NVMF_CHE);
		return (false);
	}

	cb->pbl = pbl;
	cb->pbl_addr = addr;
	cb->pbl_len = len;

	return (true);
}

static void
che_free_pbl(struct nvmf_che_command_buffer *cb)
{
	free(cb->pbl, M_NVMF_CHE);
	t4_pblpool_free(cb->qp->nca->sc, cb->pbl_addr, cb->pbl_len);
	cb->pbl = NULL;
	cb->pbl_addr = 0;
	cb->pbl_len = 0;
}

static bool
che_write_tpt_entry(struct nvmf_che_qpair *qp,
    struct nvmf_che_command_buffer *cb, uint16_t stag)
{
	uint32_t tpt_addr;
	int error;

	cb->tpte.valid_to_pdid = htobe32(F_FW_RI_TPTE_VALID |
	    V_FW_RI_TPTE_STAGKEY(CHE_STAG_COLOR(stag)) |
	    F_FW_RI_TPTE_STAGSTATE |
	    V_FW_RI_TPTE_STAGTYPE(FW_RI_STAG_NSMR) |
	    V_FW_RI_TPTE_PDID(0));
	cb->tpte.locread_to_qpid = htobe32(
	    V_FW_RI_TPTE_PERM(FW_RI_MEM_ACCESS_REM_WRITE) |
	    V_FW_RI_TPTE_ADDRTYPE(FW_RI_ZERO_BASED_TO) |
	    V_FW_RI_TPTE_PS(PAGE_SIZE) |
	    V_FW_RI_TPTE_QPID(qp->toep->tid));
#define	PBL_OFF(qp, a)	((a) - (qp)->nca->sc->vres.pbl.start)
	cb->tpte.nosnoop_pbladdr =
	    htobe32(V_FW_RI_TPTE_PBLADDR(PBL_OFF(qp, cb->pbl_addr) >> 3));
	cb->tpte.len_lo = htobe32(cb->data_len);
	cb->tpte.va_hi = 0;
	cb->tpte.va_lo_fbo = htobe32(che_fbo(cb));
	cb->tpte.dca_mwbcnt_pstag = 0;
	cb->tpte.len_hi = htobe32(cb->data_offset);

	tpt_addr = qp->tpt_offset + CHE_STAG_IDX(stag) +
	    (qp->nca->sc->vres.stag.start >> 5);
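	/*
	 * tpt_addr is in 32-byte units; assuming sizeof(struct
	 * fw_ri_tpte) is 32 bytes, the STAG index doubles as the TPT
	 * entry index relative to this queue's tpt_offset, and
	 * vres.stag.start is converted from a byte address by the
	 * >> 5 above.
	 */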

	error = che_write_adapter_mem(qp, tpt_addr, sizeof(cb->tpte),
	    &cb->tpte);
	return (error == 0);
}

static void
che_clear_tpt_entry(struct nvmf_che_qpair *qp, uint16_t stag)
{
	uint32_t tpt_addr;

	tpt_addr = qp->tpt_offset + CHE_STAG_IDX(stag) +
	    (qp->nca->sc->vres.stag.start >> 5);

	(void)che_write_adapter_mem(qp, tpt_addr, sizeof(struct fw_ri_tpte),
	    NULL);
}

static uint16_t
che_alloc_ddp_stag(struct nvmf_che_qpair *qp,
    struct nvmf_che_command_buffer *cb)
{
	uint16_t stag_idx;

	mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
	MPASS(cb->ddp_ok);

	if (qp->active_ddp_tags == qp->num_ddp_tags)
		return (CHE_DDP_NO_TAG);

	MPASS(qp->num_ddp_tags != 0);

	stag_idx = qp->next_ddp_tag;
	for (;;) {
		if (qp->open_ddp_tags[stag_idx] == NULL)
			break;
		if (stag_idx == qp->num_ddp_tags - 1) {
			stag_idx = 0;
			if (qp->ddp_color == CHE_DDP_MAX_COLOR)
				qp->ddp_color = 0;
			else
				qp->ddp_color++;
		} else
			stag_idx++;
		MPASS(stag_idx != qp->next_ddp_tag);
	}
	if (stag_idx == qp->num_ddp_tags - 1)
		qp->next_ddp_tag = 0;
	else
		qp->next_ddp_tag = stag_idx + 1;

	qp->active_ddp_tags++;
	qp->open_ddp_tags[stag_idx] = cb;

	return (CHE_DDP_TAG(stag_idx, qp->ddp_color));
}

static void
che_free_ddp_stag(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb,
    uint16_t stag)
{
	MPASS(!CHE_TAG_IS_FL(stag));

	mtx_assert(&qp->rx_buffers.lock, MA_OWNED);

	MPASS(qp->open_ddp_tags[CHE_STAG_IDX(stag)] == cb);

	qp->open_ddp_tags[CHE_STAG_IDX(stag)] = NULL;
	qp->active_ddp_tags--;
}

static uint16_t
che_alloc_ddp_tag(struct nvmf_che_qpair *qp,
    struct nvmf_che_command_buffer *cb)
{
	uint16_t stag;

	mtx_assert(&qp->rx_buffers.lock, MA_OWNED);

	if (!cb->ddp_ok)
		return (CHE_DDP_NO_TAG);

	stag = che_alloc_ddp_stag(qp, cb);
	if (stag == CHE_DDP_NO_TAG) {
		counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_setup_no_stag,
		    1);
		return (CHE_DDP_NO_TAG);
	}

	if (!che_alloc_pbl(qp, cb)) {
		che_free_ddp_stag(qp, cb, stag);
		counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_setup_error, 1);
		return (CHE_DDP_NO_TAG);
	}

	if (!che_write_tpt_entry(qp, cb, stag)) {
		che_free_pbl(cb);
		che_free_ddp_stag(qp, cb, stag);
		counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_setup_error, 1);
		return (CHE_DDP_NO_TAG);
	}

	counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_setup_ok, 1);
	return (stag);
}

static void
che_free_ddp_tag(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb,
    uint16_t stag)
{
	MPASS(!CHE_TAG_IS_FL(stag));

	mtx_assert(&qp->rx_buffers.lock, MA_OWNED);

	MPASS(qp->open_ddp_tags[CHE_STAG_IDX(stag)] == cb);

	che_clear_tpt_entry(qp, stag);
	che_free_pbl(cb);
	che_free_ddp_stag(qp, cb, stag);
}

static void
nvmf_che_write_pdu(struct nvmf_che_qpair *qp, struct mbuf *m)
{
	struct epoch_tracker et;
	struct socket *so = qp->so;
	struct inpcb *inp = sotoinpcb(so);
	struct toepcb *toep = qp->toep;

	CURVNET_SET(so->so_vnet);
	NET_EPOCH_ENTER(et);
	INP_WLOCK(inp);
	if (__predict_false(inp->inp_flags & INP_DROPPED) ||
	    __predict_false((toep->flags & TPF_ATTACHED) == 0)) {
		m_freem(m);
	} else {
		mbufq_enqueue(&toep->ulp_pduq, m);
		t4_push_pdus(toep->vi->adapter, toep, 0);
	}
	INP_WUNLOCK(inp);
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();
}

static void
nvmf_che_report_error(struct nvmf_che_qpair *qp, uint16_t fes, uint32_t fei,
    struct mbuf *rx_pdu, u_int hlen)
{
	struct nvme_tcp_term_req_hdr *hdr;
	struct mbuf *m;

	if (hlen != 0) {
		hlen = min(hlen, NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE);
		hlen = min(hlen, m_length(rx_pdu, NULL));
	}

	m = m_get2(sizeof(*hdr) + hlen, M_WAITOK, MT_DATA, M_PKTHDR);
	m->m_len = sizeof(*hdr) + hlen;
	m->m_pkthdr.len = m->m_len;
	hdr = mtod(m, void *);
	memset(hdr, 0, sizeof(*hdr));
	hdr->common.pdu_type = qp->qp.nq_controller ?
	    NVME_TCP_PDU_TYPE_C2H_TERM_REQ : NVME_TCP_PDU_TYPE_H2C_TERM_REQ;
	hdr->common.hlen = sizeof(*hdr);
	hdr->common.plen = sizeof(*hdr) + hlen;
	hdr->fes = htole16(fes);
	le32enc(hdr->fei, fei);
	if (hlen != 0)
		m_copydata(rx_pdu, 0, hlen, (caddr_t)(hdr + 1));

	nvmf_che_write_pdu(qp, m);
}

static int
nvmf_che_validate_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu)
{
	const struct nvme_tcp_common_pdu_hdr *ch;
	struct mbuf *m = pdu->m;
	uint32_t data_len, fei, plen, rx_digest;
	u_int hlen, cpl_error;
	int error;
	uint16_t fes;

	/* Determine how large of a PDU header to return for errors. */
	ch = pdu->hdr;
	hlen = ch->hlen;
	plen = le32toh(ch->plen);
	if (hlen < sizeof(*ch) || hlen > plen)
		hlen = sizeof(*ch);

	cpl_error = m->m_pkthdr.nvmf_cpl_status & CMP_STATUS_ERROR_MASK;
	switch (cpl_error) {
	case CMP_STATUS_NO_ERROR:
		break;
	case CMP_STATUS_HEADER_DIGEST:
		counter_u64_add(
		    qp->toep->ofld_rxq->rx_nvme_header_digest_errors, 1);
		printf("NVMe/TCP: Header digest mismatch\n");
		rx_digest = le32dec(mtodo(m, ch->hlen));
		nvmf_che_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_HDGST_ERROR, rx_digest, m,
		    hlen);
		return (EBADMSG);
	case CMP_STATUS_DIRECTION_MISMATCH:
		counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1);
		printf("NVMe/TCP: Invalid PDU type %u\n", ch->pdu_type);
		nvmf_che_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
		    offsetof(struct nvme_tcp_common_pdu_hdr, pdu_type), m,
		    hlen);
		return (EBADMSG);
	case CMP_STATUS_SUCCESS_NOT_LAST:
	case CMP_STATUS_DIGEST_FLAG_MISMATCH:
		counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1);
		printf("NVMe/TCP: Invalid PDU header flags %#x\n", ch->flags);
		nvmf_che_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
		    offsetof(struct nvme_tcp_common_pdu_hdr, flags), m, hlen);
		return (EBADMSG);
	case CMP_STATUS_BAD_DATA_LENGTH:
		counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1);
		printf("NVMe/TCP: Invalid PDU length %u\n", plen);
		nvmf_che_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
		    offsetof(struct nvme_tcp_common_pdu_hdr, plen), m, hlen);
		return (EBADMSG);
	case CMP_STATUS_USER_MODE_UNALLOCATED:
	case CMP_STATUS_RQT_LIMIT:
	case CMP_STATUS_RQT_WRAP:
	case CMP_STATUS_RQT_BOUND:
		device_printf(qp->nca->sc->dev,
		    "received invalid NVMET error %u\n",
		    cpl_error);
		return (ECONNRESET);
	case CMP_STATUS_TPT_LIMIT:
	case CMP_STATUS_TPT_INVALID:
	case CMP_STATUS_TPT_COLOUR_MISMATCH:
	case CMP_STATUS_TPT_MISC:
	case CMP_STATUS_TPT_WRAP:
	case CMP_STATUS_TPT_BOUND:
		counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1);
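		/*
		 * A TPT error means the adapter rejected the DDP tag
		 * carried in the PDU; report it against the ttag (H2C)
		 * or cccid (C2H) field of the offending header.
		 */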
		switch (ch->pdu_type) {
		case NVME_TCP_PDU_TYPE_H2C_DATA:
			nvmf_che_report_error(qp,
			    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
			    offsetof(struct nvme_tcp_h2c_data_hdr, ttag),
			    pdu->m, pdu->hdr->hlen);
			return (EBADMSG);
		case NVME_TCP_PDU_TYPE_C2H_DATA:
			nvmf_che_report_error(qp,
			    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
			    offsetof(struct nvme_tcp_c2h_data_hdr, cccid), m,
			    hlen);
			return (EBADMSG);
		default:
			device_printf(qp->nca->sc->dev,
			    "received DDP NVMET error %u for PDU %u\n",
			    cpl_error, ch->pdu_type);
			return (ECONNRESET);
		}
	case CMP_STATUS_TPT_LAST_PDU_UNALIGNED:
		counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1);
		nvmf_che_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, m, hlen);
		return (EBADMSG);
	case CMP_STATUS_PBL_LIMIT:
		counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1);
		nvmf_che_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, m,
		    hlen);
		return (EBADMSG);
	case CMP_STATUS_DATA_DIGEST:
		/* Handled below. */
		break;
	default:
		device_printf(qp->nca->sc->dev,
		    "received unknown NVMET error %u\n",
		    cpl_error);
		return (ECONNRESET);
	}

	error = nvmf_tcp_validate_pdu_header(ch, qp->qp.nq_controller,
	    qp->header_digests, qp->data_digests, qp->rxpda, &data_len, &fes,
	    &fei);
	if (error != 0) {
		if (error != ECONNRESET)
			nvmf_che_report_error(qp, fes, fei, m, hlen);
		return (error);
	}

	/* Check data digest if present. */
	pdu->data_digest_mismatch = false;
	if ((ch->flags & NVME_TCP_CH_FLAGS_DDGSTF) != 0) {
		if (cpl_error == CMP_STATUS_DATA_DIGEST) {
			printf("NVMe/TCP: Data digest mismatch\n");
			pdu->data_digest_mismatch = true;
			counter_u64_add(
			    qp->toep->ofld_rxq->rx_nvme_data_digest_errors, 1);
		}
	}

	pdu->data_len = data_len;

	return (0);
}

static void
nvmf_che_free_pdu(struct nvmf_che_rxpdu *pdu)
{
	m_freem(pdu->m);
	pdu->m = NULL;
	pdu->hdr = NULL;
}

static int
nvmf_che_handle_term_req(struct nvmf_che_rxpdu *pdu)
{
	const struct nvme_tcp_term_req_hdr *hdr;

	hdr = (const void *)pdu->hdr;

	printf("NVMe/TCP: Received termination request: fes %#x fei %#x\n",
	    le16toh(hdr->fes), le32dec(hdr->fei));
	nvmf_che_free_pdu(pdu);
	return (ECONNRESET);
}

static int
nvmf_che_save_command_capsule(struct nvmf_che_qpair *qp,
    struct nvmf_che_rxpdu *pdu)
{
	const struct nvme_tcp_cmd *cmd;
	struct nvmf_capsule *nc;
	struct nvmf_che_capsule *cc;

	cmd = (const void *)pdu->hdr;

	nc = nvmf_allocate_command(&qp->qp, &cmd->ccsqe, M_WAITOK);

	cc = CCAP(nc);
	cc->rx_pdu = *pdu;

	nvmf_capsule_received(&qp->qp, nc);
	return (0);
}

static int
nvmf_che_save_response_capsule(struct nvmf_che_qpair *qp,
    struct nvmf_che_rxpdu *pdu)
{
	const struct nvme_tcp_rsp *rsp;
	struct nvme_completion cpl;
	struct nvmf_capsule *nc;
	struct nvmf_che_capsule *cc;
	uint16_t cid;

	rsp = (const void *)pdu->hdr;

	/*
	 * Restore the original CID and ensure any command buffers
	 * associated with this CID have been released.  Once the CQE
	 * has been received, no further transfers to the command
	 * buffer for the associated CID can occur.
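	 * Freelist CIDs are simply returned to the fl_cid_set pool;
	 * DDP CIDs also release their STAG and command buffer here.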
	 */
	cpl = rsp->rccqe;
	cid = le16toh(cpl.cid);
	if (CHE_TAG_IS_FL(cid)) {
		cid = CHE_RAW_FL_TAG(cid);
		mtx_lock(&qp->fl_cid_lock);
		MPASS(FL_CID_ISACTIVE(cid, qp->fl_cid_set));
		cpl.cid = qp->fl_cids[cid];
		FL_CID_FREE(cid, qp->fl_cid_set);
		mtx_unlock(&qp->fl_cid_lock);

		che_purge_command_buffer(&qp->rx_buffers, rsp->rccqe.cid);
		che_purge_command_buffer(&qp->tx_buffers, rsp->rccqe.cid);
	} else {
		struct nvmf_che_command_buffer *cb;

		mtx_lock(&qp->rx_buffers.lock);
		cb = qp->open_ddp_tags[CHE_STAG_IDX(cid)];
		MPASS(cb != NULL);
		MPASS(cb->cid == rsp->rccqe.cid);
		cpl.cid = cb->original_cid;
		che_free_ddp_tag(qp, cb, cid);
		mtx_unlock(&qp->rx_buffers.lock);
		che_release_command_buffer(cb);
	}
#ifdef VERBOSE_TRACES
	CTR(KTR_CXGBE, "%s: tid %u freed cid 0x%04x for 0x%04x", __func__,
	    qp->toep->tid, le16toh(rsp->rccqe.cid), cpl.cid);
#endif

	nc = nvmf_allocate_response(&qp->qp, &cpl, M_WAITOK);

	nc->nc_sqhd_valid = true;
	cc = CCAP(nc);
	cc->rx_pdu = *pdu;

	nvmf_capsule_received(&qp->qp, nc);
	return (0);
}

/*
 * Construct a PDU that contains an optional data payload.  This
 * includes dealing with the length fields in the common header.  The
 * adapter inserts digests and padding when the PDU is transmitted.
 */
static struct mbuf *
nvmf_che_construct_pdu(struct nvmf_che_qpair *qp, void *hdr, size_t hlen,
    struct mbuf *data, uint32_t data_len)
{
	struct nvme_tcp_common_pdu_hdr *ch;
	struct mbuf *top;
	uint32_t pdo, plen;
	uint8_t ulp_submode;

	plen = hlen;
	if (qp->header_digests)
		plen += sizeof(uint32_t);
	if (data_len != 0) {
		KASSERT(m_length(data, NULL) == data_len, ("length mismatch"));
		pdo = roundup(plen, qp->txpda);
		plen = pdo + data_len;
		if (qp->data_digests)
			plen += sizeof(uint32_t);
	} else {
		KASSERT(data == NULL, ("payload mbuf with zero length"));
		pdo = 0;
	}

	top = m_get2(hlen, M_WAITOK, MT_DATA, M_PKTHDR);
	top->m_len = hlen;
	top->m_pkthdr.len = hlen;
	ch = mtod(top, void *);
	memcpy(ch, hdr, hlen);
	ch->hlen = hlen;
	ulp_submode = 0;
	if (qp->header_digests) {
		ch->flags |= NVME_TCP_CH_FLAGS_HDGSTF;
		ulp_submode |= ULP_CRC_HEADER;
	}
	if (qp->data_digests && data_len != 0) {
		ch->flags |= NVME_TCP_CH_FLAGS_DDGSTF;
		ulp_submode |= ULP_CRC_DATA;
	}
	ch->pdo = pdo;
	ch->plen = htole32(plen);
	set_mbuf_ulp_submode(top, ulp_submode);

	if (data_len != 0) {
		top->m_pkthdr.len += data_len;
		top->m_next = data;
	}

	return (top);
}

/* Allocate the next free freelist transfer tag.
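 * The returned tag has CHE_FL_TAG_MASK set so that an H2C_DATA PDU
 * referencing it can be told apart from one referencing a DDP STAG.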
 */
static bool
nvmf_che_allocate_fl_ttag(struct nvmf_che_qpair *qp,
    struct nvmf_che_command_buffer *cb)
{
	uint16_t ttag;

	mtx_assert(&qp->rx_buffers.lock, MA_OWNED);

	if (qp->active_fl_ttags == qp->num_fl_ttags)
		return (false);

	ttag = qp->next_fl_ttag;
	for (;;) {
		if (qp->open_fl_ttags[ttag] == NULL)
			break;
		if (ttag == qp->num_fl_ttags - 1)
			ttag = 0;
		else
			ttag++;
		MPASS(ttag != qp->next_fl_ttag);
	}
	if (ttag == qp->num_fl_ttags - 1)
		qp->next_fl_ttag = 0;
	else
		qp->next_fl_ttag = ttag + 1;

	qp->active_fl_ttags++;
	qp->open_fl_ttags[ttag] = cb;

	cb->ttag = ttag | CHE_FL_TAG_MASK;
	return (true);
}

/* Attempt to allocate a free transfer tag and assign it to cb. */
static bool
nvmf_che_allocate_ttag(struct nvmf_che_qpair *qp,
    struct nvmf_che_command_buffer *cb)
{
	uint16_t stag;

	mtx_assert(&qp->rx_buffers.lock, MA_OWNED);

	stag = che_alloc_ddp_tag(qp, cb);
	if (stag == CHE_DDP_NO_TAG) {
		if (!nvmf_che_allocate_fl_ttag(qp, cb))
			return (false);
	} else {
		cb->ttag = stag;
	}
#ifdef VERBOSE_TRACES
	CTR(KTR_CXGBE, "%s: tid %u allocated ttag 0x%04x", __func__,
	    qp->toep->tid, cb->ttag);
#endif
	cb->cc->active_r2ts++;
	return (true);
}

/* Find the next command buffer eligible to schedule for R2T. */
static struct nvmf_che_command_buffer *
nvmf_che_next_r2t(struct nvmf_che_qpair *qp)
{
	struct nvmf_che_command_buffer *cb;

	mtx_assert(&qp->rx_buffers.lock, MA_OWNED);

	TAILQ_FOREACH(cb, &qp->rx_buffers.head, link) {
		/* NB: maxr2t is 0's based. */
		if (cb->cc->active_r2ts > qp->maxr2t)
			continue;

		if (!nvmf_che_allocate_ttag(qp, cb))
			return (NULL);
#ifdef INVARIANTS
		cb->cc->pending_r2ts--;
#endif
		TAILQ_REMOVE(&qp->rx_buffers.head, cb, link);
		return (cb);
	}
	return (NULL);
}

/* NB: cid is little-endian already. */
static void
che_send_r2t(struct nvmf_che_qpair *qp, uint16_t cid, uint16_t ttag,
    uint32_t data_offset, uint32_t data_len)
{
	struct nvme_tcp_r2t_hdr r2t;
	struct mbuf *m;

	memset(&r2t, 0, sizeof(r2t));
	r2t.common.pdu_type = NVME_TCP_PDU_TYPE_R2T;
	r2t.cccid = cid;
	r2t.ttag = htole16(ttag);
	r2t.r2to = htole32(data_offset);
	r2t.r2tl = htole32(data_len);

	m = nvmf_che_construct_pdu(qp, &r2t, sizeof(r2t), NULL, 0);
	nvmf_che_write_pdu(qp, m);
}

/*
 * Release a transfer tag and schedule another R2T.
 *
 * NB: This drops the rx_buffers.lock mutex.
 */
static void
nvmf_che_send_next_r2t(struct nvmf_che_qpair *qp,
    struct nvmf_che_command_buffer *cb)
{
	struct nvmf_che_command_buffer *ncb;

	mtx_assert(&qp->rx_buffers.lock, MA_OWNED);

#ifdef VERBOSE_TRACES
	CTR(KTR_CXGBE, "%s: tid %u freed ttag 0x%04x", __func__, qp->toep->tid,
	    cb->ttag);
#endif
	if (CHE_TAG_IS_FL(cb->ttag)) {
		uint16_t ttag;

		ttag = CHE_RAW_FL_TAG(cb->ttag);
		MPASS(qp->open_fl_ttags[ttag] == cb);

		/* Release this transfer tag. */
		qp->open_fl_ttags[ttag] = NULL;
		qp->active_fl_ttags--;
	} else
		che_free_ddp_tag(qp, cb, cb->ttag);

	cb->cc->active_r2ts--;

	/* Schedule another R2T. */
	ncb = nvmf_che_next_r2t(qp);
	mtx_unlock(&qp->rx_buffers.lock);
	if (ncb != NULL)
		che_send_r2t(qp, ncb->cid, ncb->ttag, ncb->data_offset,
		    ncb->data_len);
}

/*
 * Copy len bytes starting at offset skip from an mbuf chain into an
 * I/O buffer at destination offset io_offset.
 */
static void
mbuf_copyto_io(struct mbuf *m, u_int skip, u_int len,
    struct nvmf_io_request *io, u_int io_offset)
{
	u_int todo;

	while (m->m_len <= skip) {
		skip -= m->m_len;
		m = m->m_next;
	}
	while (len != 0) {
		MPASS((m->m_flags & M_EXTPG) == 0);

		todo = min(m->m_len - skip, len);
		memdesc_copyback(&io->io_mem, io_offset, todo, mtodo(m, skip));
		skip = 0;
		io_offset += todo;
		len -= todo;
		m = m->m_next;
	}
}

static int
nvmf_che_handle_h2c_data(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu)
{
	const struct nvme_tcp_h2c_data_hdr *h2c;
	struct nvmf_che_command_buffer *cb;
	uint32_t data_len, data_offset;
	uint16_t ttag, fl_ttag;

	h2c = (const void *)pdu->hdr;
	if (le32toh(h2c->datal) > qp->maxh2cdata) {
		nvmf_che_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_LIMIT_EXCEEDED, 0,
		    pdu->m, pdu->hdr->hlen);
		nvmf_che_free_pdu(pdu);
		return (EBADMSG);
	}

	ttag = le16toh(h2c->ttag);
	if (CHE_TAG_IS_FL(ttag)) {
		fl_ttag = CHE_RAW_FL_TAG(ttag);
		if (fl_ttag >= qp->num_fl_ttags) {
			nvmf_che_report_error(qp,
			    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
			    offsetof(struct nvme_tcp_h2c_data_hdr, ttag),
			    pdu->m, pdu->hdr->hlen);
			nvmf_che_free_pdu(pdu);
			return (EBADMSG);
		}

		mtx_lock(&qp->rx_buffers.lock);
		cb = qp->open_fl_ttags[fl_ttag];
	} else {
		if (CHE_STAG_IDX(ttag) >= qp->num_ddp_tags) {
			nvmf_che_report_error(qp,
			    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
			    offsetof(struct nvme_tcp_h2c_data_hdr, ttag),
			    pdu->m, pdu->hdr->hlen);
			nvmf_che_free_pdu(pdu);
			return (EBADMSG);
		}

		mtx_lock(&qp->rx_buffers.lock);
		cb = qp->open_ddp_tags[CHE_STAG_IDX(ttag)];
	}

	if (cb == NULL) {
		mtx_unlock(&qp->rx_buffers.lock);
		nvmf_che_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
		    offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->m,
		    pdu->hdr->hlen);
		nvmf_che_free_pdu(pdu);
		return (EBADMSG);
	}
	MPASS(cb->ttag == ttag);

	/* For a data digest mismatch, fail the I/O request.
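	 * The transfer tag is recycled (and another R2T scheduled)
	 * before the command buffer is released, so EINTEGRITY is
	 * reported when the I/O request completes.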
	 */
	if (pdu->data_digest_mismatch) {
		nvmf_che_send_next_r2t(qp, cb);
		cb->error = EINTEGRITY;
		che_release_command_buffer(cb);
		nvmf_che_free_pdu(pdu);
		return (0);
	}

	data_len = le32toh(h2c->datal);
	if (data_len != pdu->data_len) {
		mtx_unlock(&qp->rx_buffers.lock);
		nvmf_che_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
		    offsetof(struct nvme_tcp_h2c_data_hdr, datal), pdu->m,
		    pdu->hdr->hlen);
		nvmf_che_free_pdu(pdu);
		return (EBADMSG);
	}

	data_offset = le32toh(h2c->datao);
	if (data_offset < cb->data_offset ||
	    data_offset + data_len > cb->data_offset + cb->data_len) {
		mtx_unlock(&qp->rx_buffers.lock);
		nvmf_che_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, pdu->m,
		    pdu->hdr->hlen);
		nvmf_che_free_pdu(pdu);
		return (EBADMSG);
	}

	if (data_offset != cb->data_offset + cb->data_xfered) {
		if (CHE_TAG_IS_FL(ttag)) {
			mtx_unlock(&qp->rx_buffers.lock);
			nvmf_che_report_error(qp,
			    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
			    pdu->hdr->hlen);
			nvmf_che_free_pdu(pdu);
			return (EBADMSG);
		} else {
			uint32_t ddp_bytes;

			/* Account for PDUs silently received via DDP. */
			ddp_bytes = data_offset -
			    (cb->data_offset + cb->data_xfered);
			cb->data_xfered += ddp_bytes;
#ifdef VERBOSE_TRACES
			CTR(KTR_CXGBE, "%s: tid %u previous ddp_bytes %u",
			    __func__, qp->toep->tid, ddp_bytes);
#endif
			counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_octets,
			    ddp_bytes);
		}
	}

	if ((cb->data_xfered + data_len == cb->data_len) !=
	    ((pdu->hdr->flags & NVME_TCP_H2C_DATA_FLAGS_LAST_PDU) != 0)) {
		mtx_unlock(&qp->rx_buffers.lock);
		nvmf_che_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
		    pdu->hdr->hlen);
		nvmf_che_free_pdu(pdu);
		return (EBADMSG);
	}

	cb->data_xfered += data_len;
	data_offset -= cb->data_offset;
	if (cb->data_xfered == cb->data_len) {
		nvmf_che_send_next_r2t(qp, cb);
	} else {
		che_hold_command_buffer(cb);
		mtx_unlock(&qp->rx_buffers.lock);
	}

	if (CHE_TAG_IS_FL(ttag))
		mbuf_copyto_io(pdu->m->m_next, 0, data_len, &cb->io,
		    data_offset);

	che_release_command_buffer(cb);
	nvmf_che_free_pdu(pdu);
	return (0);
}

static int
nvmf_che_handle_c2h_data(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu)
{
	const struct nvme_tcp_c2h_data_hdr *c2h;
	struct nvmf_che_command_buffer *cb;
	uint32_t data_len, data_offset;
	uint16_t cid, original_cid;

	/*
	 * Unlike freelist command buffers, DDP command buffers are
	 * not released until the response capsule is received to keep
	 * the STAG allocated until the command has completed.
	 */
	c2h = (const void *)pdu->hdr;

	cid = le16toh(c2h->cccid);
	if (CHE_TAG_IS_FL(cid)) {
		mtx_lock(&qp->rx_buffers.lock);
		cb = che_find_command_buffer(&qp->rx_buffers, c2h->cccid);
	} else {
		if (CHE_STAG_IDX(cid) >= qp->num_ddp_tags) {
			nvmf_che_report_error(qp,
			    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
			    offsetof(struct nvme_tcp_c2h_data_hdr, cccid),
			    pdu->m, pdu->hdr->hlen);
			nvmf_che_free_pdu(pdu);
			return (EBADMSG);
		}

		mtx_lock(&qp->rx_buffers.lock);
		cb = qp->open_ddp_tags[CHE_STAG_IDX(cid)];
	}

	if (cb == NULL) {
		mtx_unlock(&qp->rx_buffers.lock);
		/*
		 * XXX: Could be PDU sequence error if cccid is for a
		 * command that doesn't use a command buffer.
		 */
		nvmf_che_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
		    offsetof(struct nvme_tcp_c2h_data_hdr, cccid), pdu->m,
		    pdu->hdr->hlen);
		nvmf_che_free_pdu(pdu);
		return (EBADMSG);
	}

	/* For a data digest mismatch, fail the I/O request. */
	if (pdu->data_digest_mismatch) {
		cb->error = EINTEGRITY;
		if (CHE_TAG_IS_FL(cid)) {
			che_remove_command_buffer(&qp->rx_buffers, cb);
			mtx_unlock(&qp->rx_buffers.lock);
			che_release_command_buffer(cb);
		} else
			mtx_unlock(&qp->rx_buffers.lock);
		nvmf_che_free_pdu(pdu);
		return (0);
	}

	data_len = le32toh(c2h->datal);
	if (data_len != pdu->data_len) {
		mtx_unlock(&qp->rx_buffers.lock);
		nvmf_che_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
		    offsetof(struct nvme_tcp_c2h_data_hdr, datal), pdu->m,
		    pdu->hdr->hlen);
		nvmf_che_free_pdu(pdu);
		return (EBADMSG);
	}

	data_offset = le32toh(c2h->datao);
	if (data_offset < cb->data_offset ||
	    data_offset + data_len > cb->data_offset + cb->data_len) {
		mtx_unlock(&qp->rx_buffers.lock);
		nvmf_che_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
		    pdu->m, pdu->hdr->hlen);
		nvmf_che_free_pdu(pdu);
		return (EBADMSG);
	}

	if (data_offset != cb->data_offset + cb->data_xfered) {
		if (CHE_TAG_IS_FL(cid)) {
			mtx_unlock(&qp->rx_buffers.lock);
			nvmf_che_report_error(qp,
			    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
			    pdu->hdr->hlen);
			nvmf_che_free_pdu(pdu);
			return (EBADMSG);
		} else {
			uint32_t ddp_bytes;

			/* Account for PDUs silently received via DDP.
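			 * Preceding C2H PDUs were placed directly into
			 * the command buffer by the adapter, so datao
			 * may jump ahead of what was seen on the
			 * freelist; credit those bytes to the DDP
			 * counters.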
			 */
			ddp_bytes = data_offset -
			    (cb->data_offset + cb->data_xfered);
			cb->data_xfered += ddp_bytes;
#ifdef VERBOSE_TRACES
			CTR(KTR_CXGBE, "%s: tid %u previous ddp_bytes %u",
			    __func__, qp->toep->tid, ddp_bytes);
#endif
			counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_octets,
			    ddp_bytes);
		}
	}

	if ((cb->data_xfered + data_len == cb->data_len) !=
	    ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_LAST_PDU) != 0)) {
		mtx_unlock(&qp->rx_buffers.lock);
		nvmf_che_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
		    pdu->hdr->hlen);
		nvmf_che_free_pdu(pdu);
		return (EBADMSG);
	}

	cb->data_xfered += data_len;
	original_cid = cb->original_cid;

	if (CHE_TAG_IS_FL(cid)) {
		data_offset -= cb->data_offset;
		if (cb->data_xfered == cb->data_len)
			che_remove_command_buffer(&qp->rx_buffers, cb);
		else
			che_hold_command_buffer(cb);
		mtx_unlock(&qp->rx_buffers.lock);

		if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) {
			/*
			 * Free the CID as the command has now been
			 * completed.
			 */
			cid = CHE_RAW_FL_TAG(cid);
			mtx_lock(&qp->fl_cid_lock);
			MPASS(FL_CID_ISACTIVE(cid, qp->fl_cid_set));
			MPASS(original_cid == qp->fl_cids[cid]);
			FL_CID_FREE(cid, qp->fl_cid_set);
			mtx_unlock(&qp->fl_cid_lock);
		}

		mbuf_copyto_io(pdu->m->m_next, 0, data_len, &cb->io,
		    data_offset);

		che_release_command_buffer(cb);
	} else {
		if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) {
			/*
			 * Free the command buffer and STAG as the
			 * command has now been completed.
			 */
			che_free_ddp_tag(qp, cb, cid);
			mtx_unlock(&qp->rx_buffers.lock);
			che_release_command_buffer(cb);
		} else
			mtx_unlock(&qp->rx_buffers.lock);
	}

	if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) {
		struct nvme_completion cqe;
		struct nvmf_capsule *nc;

		memset(&cqe, 0, sizeof(cqe));
		cqe.cid = original_cid;

		nc = nvmf_allocate_response(&qp->qp, &cqe, M_WAITOK);
		nc->nc_sqhd_valid = false;

		nvmf_capsule_received(&qp->qp, nc);
	}

	nvmf_che_free_pdu(pdu);
	return (0);
}

/* Called when m_free drops refcount to 0. */
static void
nvmf_che_mbuf_done(struct mbuf *m)
{
	struct nvmf_che_command_buffer *cb = m->m_ext.ext_arg1;

	che_free_command_buffer(cb);
}

static struct mbuf *
nvmf_che_mbuf(void *arg, int how, void *data, size_t len)
{
	struct nvmf_che_command_buffer *cb = arg;
	struct mbuf *m;

	m = m_get(how, MT_DATA);
	m->m_flags |= M_RDONLY;
	m_extaddref(m, data, len, &cb->refs, nvmf_che_mbuf_done, cb, NULL);
	m->m_len = len;
	return (m);
}

static void
nvmf_che_free_mext_pg(struct mbuf *m)
{
	struct nvmf_che_command_buffer *cb = m->m_ext.ext_arg1;

	M_ASSERTEXTPG(m);
	che_release_command_buffer(cb);
}

static struct mbuf *
nvmf_che_mext_pg(void *arg, int how)
{
	struct nvmf_che_command_buffer *cb = arg;
	struct mbuf *m;

	m = mb_alloc_ext_pgs(how, nvmf_che_free_mext_pg, M_RDONLY);
	m->m_ext.ext_arg1 = cb;
	che_hold_command_buffer(cb);
	return (m);
}

/*
 * Return an mbuf chain for a range of data belonging to a command
 * buffer.
 *
 * The mbuf chain uses M_EXT mbufs which hold references on the
 * command buffer so that it remains "alive" until the data has been
 * fully transmitted.  If can_truncate is true, then a short chain
 * might be returned to avoid gratuitously splitting up a page.
 */
static struct mbuf *
nvmf_che_command_buffer_mbuf(struct nvmf_che_command_buffer *cb,
    uint32_t data_offset, uint32_t data_len, uint32_t *actual_len,
    bool can_truncate)
{
	struct mbuf *m;
	size_t len;

	m = memdesc_alloc_ext_mbufs(&cb->io.io_mem, nvmf_che_mbuf,
	    nvmf_che_mext_pg, cb, M_WAITOK, data_offset, data_len, &len,
	    can_truncate);
	if (actual_len != NULL)
		*actual_len = len;
	return (m);
}

/* NB: cid and ttag are little-endian already. */
static void
che_send_h2c_pdu(struct nvmf_che_qpair *qp, uint16_t cid, uint16_t ttag,
    uint32_t data_offset, struct mbuf *m, size_t len, bool last_pdu)
{
	struct nvme_tcp_h2c_data_hdr h2c;
	struct mbuf *top;

	memset(&h2c, 0, sizeof(h2c));
	h2c.common.pdu_type = NVME_TCP_PDU_TYPE_H2C_DATA;
	if (last_pdu)
		h2c.common.flags |= NVME_TCP_H2C_DATA_FLAGS_LAST_PDU;
	h2c.cccid = cid;
	h2c.ttag = ttag;
	h2c.datao = htole32(data_offset);
	h2c.datal = htole32(len);

	top = nvmf_che_construct_pdu(qp, &h2c, sizeof(h2c), m, len);
	nvmf_che_write_pdu(qp, top);
}

static int
nvmf_che_handle_r2t(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu)
{
	const struct nvme_tcp_r2t_hdr *r2t;
	struct nvmf_che_command_buffer *cb;
	uint32_t data_len, data_offset;

	r2t = (const void *)pdu->hdr;

	mtx_lock(&qp->tx_buffers.lock);
	cb = che_find_command_buffer(&qp->tx_buffers, r2t->cccid);
	if (cb == NULL) {
		mtx_unlock(&qp->tx_buffers.lock);
		nvmf_che_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
		    offsetof(struct nvme_tcp_r2t_hdr, cccid), pdu->m,
		    pdu->hdr->hlen);
		nvmf_che_free_pdu(pdu);
		return (EBADMSG);
	}

	data_offset = le32toh(r2t->r2to);
	if (data_offset != cb->data_xfered) {
		mtx_unlock(&qp->tx_buffers.lock);
		nvmf_che_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
		    pdu->hdr->hlen);
		nvmf_che_free_pdu(pdu);
		return (EBADMSG);
	}

	/*
	 * XXX: The spec does not specify how to handle R2T transfers
	 * out of range of the original command.
	 */
	data_len = le32toh(r2t->r2tl);
	if (data_offset + data_len > cb->data_len) {
		mtx_unlock(&qp->tx_buffers.lock);
		nvmf_che_report_error(qp,
		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
		    pdu->m, pdu->hdr->hlen);
		nvmf_che_free_pdu(pdu);
		return (EBADMSG);
	}

	cb->data_xfered += data_len;
	if (cb->data_xfered == cb->data_len)
		che_remove_command_buffer(&qp->tx_buffers, cb);
	else
		che_hold_command_buffer(cb);
	mtx_unlock(&qp->tx_buffers.lock);

	/*
	 * Queue one or more H2C_DATA PDUs containing the requested
	 * data.
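	 * Each PDU carries at most qp->max_tx_data payload bytes, and
	 * nvmf_che_command_buffer_mbuf() may truncate a non-final
	 * chain at a page boundary, so "sent" can be less than "todo".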
	 */
	while (data_len > 0) {
		struct mbuf *m;
		uint32_t sent, todo;

		todo = min(data_len, qp->max_tx_data);
		m = nvmf_che_command_buffer_mbuf(cb, data_offset, todo, &sent,
		    todo < data_len);
		che_send_h2c_pdu(qp, r2t->cccid, r2t->ttag, data_offset, m,
		    sent, sent == data_len);

		data_offset += sent;
		data_len -= sent;
	}

	che_release_command_buffer(cb);
	nvmf_che_free_pdu(pdu);
	return (0);
}

static int
nvmf_che_dispatch_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu)
{
	/*
	 * The PDU header should always be contiguous in the mbuf from
	 * CPL_NVMT_CMP.
	 */
	pdu->hdr = mtod(pdu->m, void *);
	KASSERT(pdu->m->m_len == pdu->hdr->hlen +
	    ((pdu->hdr->flags & NVME_TCP_CH_FLAGS_HDGSTF) != 0 ?
	    sizeof(uint32_t) : 0),
	    ("%s: mismatched PDU header mbuf length", __func__));

	switch (pdu->hdr->pdu_type) {
	default:
		__assert_unreachable();
		break;
	case NVME_TCP_PDU_TYPE_H2C_TERM_REQ:
	case NVME_TCP_PDU_TYPE_C2H_TERM_REQ:
		return (nvmf_che_handle_term_req(pdu));
	case NVME_TCP_PDU_TYPE_CAPSULE_CMD:
		return (nvmf_che_save_command_capsule(qp, pdu));
	case NVME_TCP_PDU_TYPE_CAPSULE_RESP:
		return (nvmf_che_save_response_capsule(qp, pdu));
	case NVME_TCP_PDU_TYPE_H2C_DATA:
		return (nvmf_che_handle_h2c_data(qp, pdu));
	case NVME_TCP_PDU_TYPE_C2H_DATA:
		return (nvmf_che_handle_c2h_data(qp, pdu));
	case NVME_TCP_PDU_TYPE_R2T:
		return (nvmf_che_handle_r2t(qp, pdu));
	}
}

static int
nvmf_che_attach_pdu_data(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu)
{
	struct socket *so = qp->so;
	struct mbuf *m, *n;
	uint32_t tcp_seq;
	size_t len;
	int error;

	/* Check for DDP data.
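	 * For DDP the payload was already placed directly into the
	 * command buffer by the adapter, so only the counters need to
	 * be updated here.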
*/ 1883 if (pdu->ddp) { 1884 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_pdus, 1); 1885 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_octets, 1886 pdu->data_len); 1887 return (0); 1888 } 1889 1890 error = 0; 1891 len = pdu->data_len; 1892 tcp_seq = pdu->m->m_pkthdr.nvmf_tcp_seq; 1893 m = pdu->m; 1894 SOCKBUF_LOCK(&so->so_rcv); 1895 while (len > 0) { 1896 n = mbufq_dequeue(&qp->rx_data); 1897 KASSERT(n != NULL, ("%s: missing %zu data", __func__, len)); 1898 if (n == NULL) { 1899 error = ENOBUFS; 1900 break; 1901 } 1902 1903 KASSERT(n->m_pkthdr.nvmf_tcp_seq == tcp_seq, 1904 ("%s: TCP seq mismatch", __func__)); 1905 KASSERT(n->m_pkthdr.len <= len, 1906 ("%s: too much data", __func__)); 1907 if (n->m_pkthdr.nvmf_tcp_seq != tcp_seq || 1908 n->m_pkthdr.len > len) { 1909 m_freem(n); 1910 error = ENOBUFS; 1911 break; 1912 } 1913 1914 #ifdef VERBOSE_TRACES 1915 CTR(KTR_CXGBE, "%s: tid %u len %d seq %u", __func__, 1916 qp->toep->tid, n->m_pkthdr.len, n->m_pkthdr.nvmf_tcp_seq); 1917 #endif 1918 pdu->m->m_pkthdr.len += n->m_pkthdr.len; 1919 len -= n->m_pkthdr.len; 1920 tcp_seq += n->m_pkthdr.len; 1921 m_demote_pkthdr(n); 1922 m->m_next = n; 1923 m = m_last(n); 1924 } 1925 SOCKBUF_UNLOCK(&so->so_rcv); 1926 1927 if (error == 0) { 1928 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_fl_pdus, 1); 1929 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_fl_octets, 1930 pdu->data_len); 1931 } 1932 return (error); 1933 } 1934 1935 static void 1936 nvmf_che_receive(void *arg) 1937 { 1938 struct nvmf_che_qpair *qp = arg; 1939 struct socket *so = qp->so; 1940 struct nvmf_che_rxpdu pdu; 1941 struct mbuf *m; 1942 int error, terror; 1943 1944 SOCKBUF_LOCK(&so->so_rcv); 1945 while (!qp->rx_shutdown) { 1946 /* Wait for a PDU. */ 1947 if (so->so_error != 0 || so->so_rerror != 0) { 1948 if (so->so_error != 0) 1949 error = so->so_error; 1950 else 1951 error = so->so_rerror; 1952 SOCKBUF_UNLOCK(&so->so_rcv); 1953 error: 1954 nvmf_qpair_error(&qp->qp, error); 1955 SOCKBUF_LOCK(&so->so_rcv); 1956 while (!qp->rx_shutdown) 1957 cv_wait(&qp->rx_cv, SOCKBUF_MTX(&so->so_rcv)); 1958 break; 1959 } 1960 1961 m = mbufq_dequeue(&qp->rx_pdus); 1962 if (m == NULL) { 1963 if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) != 0) { 1964 error = 0; 1965 SOCKBUF_UNLOCK(&so->so_rcv); 1966 goto error; 1967 } 1968 cv_wait(&qp->rx_cv, SOCKBUF_MTX(&so->so_rcv)); 1969 continue; 1970 } 1971 SOCKBUF_UNLOCK(&so->so_rcv); 1972 1973 pdu.m = m; 1974 pdu.hdr = mtod(m, const void *); 1975 pdu.ddp = (m->m_pkthdr.nvmf_cpl_status & CMP_STATUS_DDP) != 0; 1976 1977 error = nvmf_che_validate_pdu(qp, &pdu); 1978 if (error == 0 && pdu.data_len != 0) 1979 error = nvmf_che_attach_pdu_data(qp, &pdu); 1980 if (error != 0) 1981 nvmf_che_free_pdu(&pdu); 1982 else 1983 error = nvmf_che_dispatch_pdu(qp, &pdu); 1984 if (error != 0) { 1985 /* 1986 * If we received a termination request, close 1987 * the connection immediately. 1988 */ 1989 if (error == ECONNRESET) 1990 goto error; 1991 1992 /* 1993 * Wait for up to 30 seconds for the socket to 1994 * be closed by the other end. 
1995 */ 1996 SOCKBUF_LOCK(&so->so_rcv); 1997 if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { 1998 terror = cv_timedwait(&qp->rx_cv, 1999 SOCKBUF_MTX(&so->so_rcv), 30 * hz); 2000 if (terror == ETIMEDOUT) 2001 printf("NVMe/TCP: Timed out after sending terminate request\n"); 2002 } 2003 SOCKBUF_UNLOCK(&so->so_rcv); 2004 goto error; 2005 } 2006 2007 SOCKBUF_LOCK(&so->so_rcv); 2008 } 2009 SOCKBUF_UNLOCK(&so->so_rcv); 2010 kthread_exit(); 2011 } 2012 2013 static int 2014 nvmf_che_soupcall_receive(struct socket *so, void *arg, int waitflag) 2015 { 2016 struct nvmf_che_qpair *qp = arg; 2017 2018 cv_signal(&qp->rx_cv); 2019 return (SU_OK); 2020 } 2021 2022 static int 2023 do_nvmt_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 2024 { 2025 struct adapter *sc = iq->adapter; 2026 struct nvmf_che_adapter *nca = sc->nvme_ulp_softc; 2027 const struct cpl_nvmt_data *cpl; 2028 u_int tid; 2029 struct toepcb *toep; 2030 struct nvmf_che_qpair *qp; 2031 struct socket *so; 2032 struct inpcb *inp; 2033 struct tcpcb *tp; 2034 int len __diagused; 2035 2036 if (nca->nvmt_data_iqe) { 2037 cpl = (const void *)(rss + 1); 2038 } else { 2039 cpl = mtod(m, const void *); 2040 2041 /* strip off CPL header */ 2042 m_adj(m, sizeof(*cpl)); 2043 } 2044 tid = GET_TID(cpl); 2045 toep = lookup_tid(sc, tid); 2046 2047 KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__)); 2048 2049 len = m->m_pkthdr.len; 2050 2051 KASSERT(len == be16toh(cpl->length), 2052 ("%s: payload length mismatch", __func__)); 2053 2054 inp = toep->inp; 2055 INP_WLOCK(inp); 2056 if (inp->inp_flags & INP_DROPPED) { 2057 CTR(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", 2058 __func__, tid, len, inp->inp_flags); 2059 INP_WUNLOCK(inp); 2060 m_freem(m); 2061 return (0); 2062 } 2063 2064 /* Save TCP sequence number. */ 2065 m->m_pkthdr.nvmf_tcp_seq = be32toh(cpl->seq); 2066 2067 qp = toep->ulpcb; 2068 so = qp->so; 2069 SOCKBUF_LOCK(&so->so_rcv); 2070 mbufq_enqueue(&qp->rx_data, m); 2071 SOCKBUF_UNLOCK(&so->so_rcv); 2072 2073 tp = intotcpcb(inp); 2074 tp->t_rcvtime = ticks; 2075 2076 #ifdef VERBOSE_TRACES 2077 CTR(KTR_CXGBE, "%s: tid %u len %d seq %u", __func__, tid, len, 2078 be32toh(cpl->seq)); 2079 #endif 2080 2081 INP_WUNLOCK(inp); 2082 return (0); 2083 } 2084 2085 static int 2086 do_nvmt_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 2087 { 2088 struct adapter *sc = iq->adapter; 2089 const struct cpl_nvmt_cmp *cpl = mtod(m, const void *); 2090 u_int tid = GET_TID(cpl); 2091 struct toepcb *toep = lookup_tid(sc, tid); 2092 struct nvmf_che_qpair *qp = toep->ulpcb; 2093 struct socket *so = qp->so; 2094 struct inpcb *inp = toep->inp; 2095 u_int hlen __diagused; 2096 bool empty; 2097 2098 KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__)); 2099 KASSERT(!(toep->flags & TPF_SYNQE), 2100 ("%s: toep %p claims to be a synq entry", __func__, toep)); 2101 2102 /* strip off CPL header */ 2103 m_adj(m, sizeof(*cpl)); 2104 hlen = m->m_pkthdr.len; 2105 2106 KASSERT(hlen == be16toh(cpl->length), 2107 ("%s: payload length mismatch", __func__)); 2108 2109 INP_WLOCK(inp); 2110 if (inp->inp_flags & INP_DROPPED) { 2111 CTR(KTR_CXGBE, "%s: tid %u, rx (hlen %u), inp_flags 0x%x", 2112 __func__, tid, hlen, inp->inp_flags); 2113 INP_WUNLOCK(inp); 2114 m_freem(m); 2115 return (0); 2116 } 2117 2118 #ifdef VERBOSE_TRACES 2119 CTR(KTR_CXGBE, "%s: tid %u hlen %u seq %u status %u", __func__, tid, 2120 hlen, be32toh(cpl->seq), cpl->status); 2121 #endif 2122 2123 /* Save TCP sequence number and CPL status. 
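Both are stashed in the mbuf packet header so the receive thread can detect DDP completions and match this PDU against the data mbufs queued in rx_data.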
*/ 2124 m->m_pkthdr.nvmf_tcp_seq = be32toh(cpl->seq); 2125 m->m_pkthdr.nvmf_cpl_status = cpl->status; 2126 2127 SOCKBUF_LOCK(&so->so_rcv); 2128 empty = mbufq_len(&qp->rx_pdus) == 0; 2129 mbufq_enqueue(&qp->rx_pdus, m); 2130 SOCKBUF_UNLOCK(&so->so_rcv); 2131 INP_WUNLOCK(inp); 2132 if (empty) 2133 cv_signal(&qp->rx_cv); 2134 return (0); 2135 } 2136 2137 static uint16_t 2138 che_alloc_fl_cid(struct nvmf_che_qpair *qp, uint16_t original_cid) 2139 { 2140 uint16_t new_cid; 2141 2142 mtx_lock(&qp->fl_cid_lock); 2143 new_cid = FL_CID_FINDFREE_AT(qp->fl_cid_set, qp->next_cid); 2144 if (new_cid == 0) { 2145 new_cid = FL_CID_FINDFREE_AT(qp->fl_cid_set, 0); 2146 MPASS(new_cid != 0); 2147 } 2148 new_cid--; 2149 FL_CID_BUSY(new_cid, qp->fl_cid_set); 2150 if (new_cid == CHE_MAX_FL_TAG) 2151 qp->next_cid = 0; 2152 else 2153 qp->next_cid = new_cid + 1; 2154 qp->fl_cids[new_cid] = original_cid; 2155 mtx_unlock(&qp->fl_cid_lock); 2156 2157 return (new_cid | CHE_FL_TAG_MASK); 2158 } 2159 2160 static uint16_t 2161 che_alloc_ddp_cid(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb) 2162 { 2163 mtx_assert(&qp->rx_buffers.lock, MA_OWNED); 2164 2165 return (che_alloc_ddp_tag(qp, cb)); 2166 } 2167 2168 static struct mbuf * 2169 che_command_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_capsule *cc) 2170 { 2171 struct nvmf_capsule *nc = &cc->nc; 2172 struct nvmf_che_command_buffer *cb; 2173 struct nvme_sgl_descriptor *sgl; 2174 struct nvme_tcp_cmd cmd; 2175 struct mbuf *top, *m; 2176 uint16_t cid; 2177 bool use_icd; 2178 2179 use_icd = false; 2180 cb = NULL; 2181 m = NULL; 2182 2183 if (nc->nc_data.io_len != 0) { 2184 cb = che_alloc_command_buffer(qp, &nc->nc_data, 0, 2185 nc->nc_data.io_len, nc->nc_sqe.cid); 2186 cb->original_cid = nc->nc_sqe.cid; 2187 2188 if (nc->nc_send_data && nc->nc_data.io_len <= qp->max_icd) { 2189 cid = che_alloc_fl_cid(qp, nc->nc_sqe.cid); 2190 use_icd = true; 2191 m = nvmf_che_command_buffer_mbuf(cb, 0, 2192 nc->nc_data.io_len, NULL, false); 2193 cb->data_xfered = nc->nc_data.io_len; 2194 che_release_command_buffer(cb); 2195 } else if (nc->nc_send_data) { 2196 cid = che_alloc_fl_cid(qp, nc->nc_sqe.cid); 2197 cb->cid = htole16(cid); 2198 mtx_lock(&qp->tx_buffers.lock); 2199 che_add_command_buffer(&qp->tx_buffers, cb); 2200 mtx_unlock(&qp->tx_buffers.lock); 2201 } else { 2202 mtx_lock(&qp->rx_buffers.lock); 2203 cid = che_alloc_ddp_cid(qp, cb); 2204 if (cid == CHE_DDP_NO_TAG) { 2205 cid = che_alloc_fl_cid(qp, nc->nc_sqe.cid); 2206 che_add_command_buffer(&qp->rx_buffers, cb); 2207 } 2208 cb->cid = htole16(cid); 2209 mtx_unlock(&qp->rx_buffers.lock); 2210 } 2211 } else 2212 cid = che_alloc_fl_cid(qp, nc->nc_sqe.cid); 2213 2214 #ifdef VERBOSE_TRACES 2215 CTR(KTR_CXGBE, "%s: tid %u allocated cid 0x%04x for 0x%04x", __func__, 2216 qp->toep->tid, cid, nc->nc_sqe.cid); 2217 #endif 2218 memset(&cmd, 0, sizeof(cmd)); 2219 cmd.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_CMD; 2220 cmd.ccsqe = nc->nc_sqe; 2221 cmd.ccsqe.cid = htole16(cid); 2222 2223 /* Populate SGL in SQE. */ 2224 sgl = &cmd.ccsqe.sgl; 2225 memset(sgl, 0, sizeof(*sgl)); 2226 sgl->address = 0; 2227 sgl->length = htole32(nc->nc_data.io_len); 2228 if (use_icd) { 2229 /* Use in-capsule data. */ 2230 sgl->type = NVME_SGL_TYPE_ICD; 2231 } else { 2232 /* Use a command buffer. */ 2233 sgl->type = NVME_SGL_TYPE_COMMAND_BUFFER; 2234 } 2235 2236 top = nvmf_che_construct_pdu(qp, &cmd, sizeof(cmd), m, m != NULL ? 
2237 nc->nc_data.io_len : 0); 2238 return (top); 2239 } 2240 2241 static struct mbuf * 2242 che_response_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_capsule *cc) 2243 { 2244 struct nvmf_capsule *nc = &cc->nc; 2245 struct nvme_tcp_rsp rsp; 2246 2247 memset(&rsp, 0, sizeof(rsp)); 2248 rsp.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_RESP; 2249 rsp.rccqe = nc->nc_cqe; 2250 2251 return (nvmf_che_construct_pdu(qp, &rsp, sizeof(rsp), NULL, 0)); 2252 } 2253 2254 static struct mbuf * 2255 capsule_to_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_capsule *cc) 2256 { 2257 if (cc->nc.nc_qe_len == sizeof(struct nvme_command)) 2258 return (che_command_pdu(qp, cc)); 2259 else 2260 return (che_response_pdu(qp, cc)); 2261 } 2262 2263 static void 2264 nvmf_che_send(void *arg) 2265 { 2266 struct nvmf_che_qpair *qp = arg; 2267 struct nvmf_che_capsule *cc; 2268 struct socket *so = qp->so; 2269 struct mbuf *m; 2270 int error; 2271 2272 m = NULL; 2273 SOCKBUF_LOCK(&so->so_snd); 2274 while (!qp->tx_shutdown) { 2275 if (so->so_error != 0) { 2276 error = so->so_error; 2277 SOCKBUF_UNLOCK(&so->so_snd); 2278 m_freem(m); 2279 nvmf_qpair_error(&qp->qp, error); 2280 SOCKBUF_LOCK(&so->so_snd); 2281 while (!qp->tx_shutdown) 2282 cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd)); 2283 break; 2284 } 2285 2286 if (STAILQ_EMPTY(&qp->tx_capsules)) { 2287 cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd)); 2288 continue; 2289 } 2290 2291 /* Convert a capsule into a PDU. */ 2292 cc = STAILQ_FIRST(&qp->tx_capsules); 2293 STAILQ_REMOVE_HEAD(&qp->tx_capsules, link); 2294 SOCKBUF_UNLOCK(&so->so_snd); 2295 2296 m = capsule_to_pdu(qp, cc); 2297 che_release_capsule(cc); 2298 2299 nvmf_che_write_pdu(qp, m); 2300 2301 SOCKBUF_LOCK(&so->so_snd); 2302 } 2303 SOCKBUF_UNLOCK(&so->so_snd); 2304 kthread_exit(); 2305 } 2306 2307 static int 2308 nvmf_che_setsockopt(struct socket *so, u_int sspace, u_int rspace) 2309 { 2310 struct sockopt opt; 2311 int error, one = 1; 2312 2313 /* Don't lower the buffer sizes, just enforce a minimum. */ 2314 SOCKBUF_LOCK(&so->so_snd); 2315 if (sspace < so->so_snd.sb_hiwat) 2316 sspace = so->so_snd.sb_hiwat; 2317 SOCKBUF_UNLOCK(&so->so_snd); 2318 SOCKBUF_LOCK(&so->so_rcv); 2319 if (rspace < so->so_rcv.sb_hiwat) 2320 rspace = so->so_rcv.sb_hiwat; 2321 SOCKBUF_UNLOCK(&so->so_rcv); 2322 2323 error = soreserve(so, sspace, rspace); 2324 if (error != 0) 2325 return (error); 2326 SOCKBUF_LOCK(&so->so_snd); 2327 so->so_snd.sb_flags |= SB_AUTOSIZE; 2328 SOCKBUF_UNLOCK(&so->so_snd); 2329 SOCKBUF_LOCK(&so->so_rcv); 2330 so->so_rcv.sb_flags |= SB_AUTOSIZE; 2331 SOCKBUF_UNLOCK(&so->so_rcv); 2332 2333 /* 2334 * Disable Nagle. 
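* Small PDUs such as R2Ts and response capsules should go out immediately rather than waiting to be coalesced with later data.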
2335 */ 2336 bzero(&opt, sizeof(opt)); 2337 opt.sopt_dir = SOPT_SET; 2338 opt.sopt_level = IPPROTO_TCP; 2339 opt.sopt_name = TCP_NODELAY; 2340 opt.sopt_val = &one; 2341 opt.sopt_valsize = sizeof(one); 2342 error = sosetopt(so, &opt); 2343 if (error != 0) 2344 return (error); 2345 2346 return (0); 2347 } 2348 2349 static void 2350 t4_nvme_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, 2351 uint64_t val) 2352 { 2353 struct adapter *sc = td_adapter(toep->td); 2354 2355 t4_set_tcb_field(sc, &toep->ofld_txq->wrq, toep, word, mask, val, 0, 0); 2356 } 2357 2358 static void 2359 set_ulp_mode_nvme(struct toepcb *toep, u_int ulp_submode, uint8_t rxpda) 2360 { 2361 uint64_t val; 2362 2363 CTR(KTR_CXGBE, "%s: tid %u, ULP_MODE_NVMET, submode=%#x, rxpda=%u", 2364 __func__, toep->tid, ulp_submode, rxpda); 2365 2366 val = V_TCB_ULP_TYPE(ULP_MODE_NVMET) | V_TCB_ULP_RAW(ulp_submode); 2367 t4_nvme_set_tcb_field(toep, W_TCB_ULP_TYPE, 2368 V_TCB_ULP_TYPE(M_TCB_ULP_TYPE) | V_TCB_ULP_RAW(M_TCB_ULP_RAW), val); 2369 2370 val = V_TF_RX_FLOW_CONTROL_DISABLE(1ULL); 2371 t4_nvme_set_tcb_field(toep, W_TCB_T_FLAGS, val, val); 2372 2373 val = V_TCB_RSVD((rxpda / 4) - 1); 2374 t4_nvme_set_tcb_field(toep, W_TCB_RSVD, V_TCB_RSVD(M_TCB_RSVD), val); 2375 2376 /* 0 disables CPL_NVMT_CMP_IMM which is not useful in this driver. */ 2377 val = 0; 2378 t4_nvme_set_tcb_field(toep, W_TCB_CMP_IMM_SZ, 2379 V_TCB_CMP_IMM_SZ(M_TCB_CMP_IMM_SZ), val); 2380 } 2381 2382 static u_int 2383 pdu_max_data_len(const nvlist_t *nvl, u_int max_pdu_len, u_int hlen, 2384 uint8_t pda) 2385 { 2386 u_int max_data_len; 2387 2388 if (nvlist_get_bool(nvl, "header_digests")) 2389 hlen += sizeof(uint32_t); 2390 hlen = roundup(hlen, pda); 2391 max_data_len = max_pdu_len - hlen; 2392 if (nvlist_get_bool(nvl, "data_digests")) 2393 max_data_len -= sizeof(uint32_t); 2394 return (max_data_len); 2395 } 2396 2397 static struct nvmf_qpair * 2398 che_allocate_qpair(bool controller, const nvlist_t *nvl) 2399 { 2400 struct nvmf_che_adapter *nca; 2401 struct nvmf_che_qpair *qp; 2402 struct adapter *sc; 2403 struct file *fp; 2404 struct socket *so; 2405 struct inpcb *inp; 2406 struct tcpcb *tp; 2407 struct toepcb *toep; 2408 cap_rights_t rights; 2409 u_int max_tx_pdu_len, num_ddp_tags; 2410 int error, ulp_submode; 2411 2412 if (!nvlist_exists_number(nvl, "fd") || 2413 !nvlist_exists_number(nvl, "rxpda") || 2414 !nvlist_exists_number(nvl, "txpda") || 2415 !nvlist_exists_bool(nvl, "header_digests") || 2416 !nvlist_exists_bool(nvl, "data_digests") || 2417 !nvlist_exists_number(nvl, "maxr2t") || 2418 !nvlist_exists_number(nvl, "maxh2cdata") || 2419 !nvlist_exists_number(nvl, "max_icd")) 2420 return (NULL); 2421 2422 error = fget(curthread, nvlist_get_number(nvl, "fd"), 2423 cap_rights_init_one(&rights, CAP_SOCK_CLIENT), &fp); 2424 if (error != 0) 2425 return (NULL); 2426 if (fp->f_type != DTYPE_SOCKET) { 2427 fdrop(fp, curthread); 2428 return (NULL); 2429 } 2430 so = fp->f_data; 2431 if (so->so_type != SOCK_STREAM || 2432 so->so_proto->pr_protocol != IPPROTO_TCP) { 2433 fdrop(fp, curthread); 2434 return (NULL); 2435 } 2436 2437 sc = find_offload_adapter(so); 2438 if (sc == NULL) { 2439 fdrop(fp, curthread); 2440 return (NULL); 2441 } 2442 nca = sc->nvme_ulp_softc; 2443 2444 /* 2445 * Controller: Require advertised MAXH2CDATA to be small 2446 * enough. 
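* A single H2C Data PDU carrying MAXH2CDATA bytes must fit within the largest PDU the adapter can receive.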
2447 */ 2448 if (controller) { 2449 u_int max_rx_data; 2450 2451 max_rx_data = pdu_max_data_len(nvl, nca->max_receive_pdu, 2452 sizeof(struct nvme_tcp_h2c_data_hdr), 2453 nvlist_get_number(nvl, "rxpda")); 2454 if (nvlist_get_number(nvl, "maxh2cdata") > max_rx_data) { 2455 fdrop(fp, curthread); 2456 return (NULL); 2457 } 2458 } 2459 2460 /* 2461 * Host: Require the queue size to be small enough that all of 2462 * the command ids allocated by nvmf(4) will fit in the 2463 * unallocated range. 2464 * 2465 * XXX: Alternatively this driver could just queue commands 2466 * when an unallocated ID isn't available. 2467 */ 2468 if (!controller) { 2469 u_int num_commands; 2470 2471 num_commands = nvlist_get_number(nvl, "qsize") - 1; 2472 if (nvlist_get_bool(nvl, "admin")) 2473 num_commands += 8; /* Max AER */ 2474 if (num_commands > CHE_NUM_FL_TAGS) { 2475 fdrop(fp, curthread); 2476 return (NULL); 2477 } 2478 } 2479 2480 qp = malloc(sizeof(*qp), M_NVMF_CHE, M_WAITOK | M_ZERO); 2481 qp->txpda = nvlist_get_number(nvl, "txpda"); 2482 qp->rxpda = nvlist_get_number(nvl, "rxpda"); 2483 qp->header_digests = nvlist_get_bool(nvl, "header_digests"); 2484 qp->data_digests = nvlist_get_bool(nvl, "data_digests"); 2485 qp->maxr2t = nvlist_get_number(nvl, "maxr2t"); 2486 if (controller) 2487 qp->maxh2cdata = nvlist_get_number(nvl, "maxh2cdata"); 2488 2489 if (controller) { 2490 /* NB: maxr2t is 0's based. */ 2491 qp->num_fl_ttags = MIN(CHE_NUM_FL_TAGS, 2492 nvlist_get_number(nvl, "qsize") * 2493 ((uint64_t)qp->maxr2t + 1)); 2494 qp->open_fl_ttags = mallocarray(qp->num_fl_ttags, 2495 sizeof(*qp->open_fl_ttags), M_NVMF_CHE, M_WAITOK | M_ZERO); 2496 } else { 2497 qp->fl_cids = mallocarray(CHE_NUM_FL_TAGS, 2498 sizeof(*qp->fl_cids), M_NVMF_CHE, M_WAITOK | M_ZERO); 2499 qp->fl_cid_set = malloc(sizeof(*qp->fl_cid_set), M_NVMF_CHE, 2500 M_WAITOK); 2501 FL_CID_INIT(qp->fl_cid_set); 2502 mtx_init(&qp->fl_cid_lock, "nvmf/che fl cids", NULL, MTX_DEF); 2503 } 2504 2505 inp = sotoinpcb(so); 2506 INP_WLOCK(inp); 2507 tp = intotcpcb(inp); 2508 if (inp->inp_flags & INP_DROPPED) { 2509 INP_WUNLOCK(inp); 2510 free(qp->fl_cid_set, M_NVMF_CHE); 2511 free(qp->fl_cids, M_NVMF_CHE); 2512 free(qp->open_fl_ttags, M_NVMF_CHE); 2513 free(qp, M_NVMF_CHE); 2514 fdrop(fp, curthread); 2515 return (NULL); 2516 } 2517 2518 MPASS(tp->t_flags & TF_TOE); 2519 MPASS(tp->tod != NULL); 2520 MPASS(tp->t_toe != NULL); 2521 toep = tp->t_toe; 2522 MPASS(toep->vi->adapter == sc); 2523 2524 if (ulp_mode(toep) != ULP_MODE_NONE) { 2525 INP_WUNLOCK(inp); 2526 free(qp->fl_cid_set, M_NVMF_CHE); 2527 free(qp->fl_cids, M_NVMF_CHE); 2528 free(qp->open_fl_ttags, M_NVMF_CHE); 2529 free(qp, M_NVMF_CHE); 2530 fdrop(fp, curthread); 2531 return (NULL); 2532 } 2533 2534 /* Claim socket from file descriptor. */ 2535 fp->f_ops = &badfileops; 2536 fp->f_data = NULL; 2537 2538 qp->so = so; 2539 qp->toep = toep; 2540 qp->nca = nca; 2541 refcount_init(&qp->refs, 1); 2542 2543 /* NB: C2H and H2C headers are the same size. 
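The C2H header size is therefore used below when computing both the receive and transmit data limits.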
*/ 2544 qp->max_rx_data = pdu_max_data_len(nvl, nca->max_receive_pdu, 2545 sizeof(struct nvme_tcp_c2h_data_hdr), qp->rxpda); 2546 qp->max_tx_data = pdu_max_data_len(nvl, nca->max_transmit_pdu, 2547 sizeof(struct nvme_tcp_c2h_data_hdr), qp->txpda); 2548 if (!controller) { 2549 qp->max_tx_data = min(qp->max_tx_data, 2550 nvlist_get_number(nvl, "maxh2cdata")); 2551 qp->max_icd = min(nvlist_get_number(nvl, "max_icd"), 2552 pdu_max_data_len(nvl, nca->max_transmit_pdu, 2553 sizeof(struct nvme_tcp_cmd), qp->txpda)); 2554 } else { 2555 /* 2556 * IOCCSZ represents the size of a logical command 2557 * capsule including the 64 byte SQE and the 2558 * in-capsule data. Use pdu_max_data_len to compute 2559 * the maximum supported ICD length. 2560 */ 2561 qp->max_ioccsz = rounddown(pdu_max_data_len(nvl, 2562 nca->max_receive_pdu, sizeof(struct nvme_tcp_cmd), 2563 qp->rxpda), 16) + sizeof(struct nvme_command); 2564 } 2565 2566 ulp_submode = 0; 2567 if (qp->header_digests) 2568 ulp_submode |= FW_NVMET_ULPSUBMODE_HCRC; 2569 if (qp->data_digests) 2570 ulp_submode |= FW_NVMET_ULPSUBMODE_DCRC; 2571 if (!controller) 2572 ulp_submode |= FW_NVMET_ULPSUBMODE_ING_DIR; 2573 2574 max_tx_pdu_len = sizeof(struct nvme_tcp_h2c_data_hdr); 2575 if (qp->header_digests) 2576 max_tx_pdu_len += sizeof(uint32_t); 2577 max_tx_pdu_len = roundup(max_tx_pdu_len, qp->txpda); 2578 max_tx_pdu_len += qp->max_tx_data; 2579 if (qp->data_digests) 2580 max_tx_pdu_len += sizeof(uint32_t); 2581 2582 /* TODO: ISO limits */ 2583 2584 if (controller) { 2585 /* Use the SUCCESS flag if SQ flow control is disabled. */ 2586 qp->send_success = !nvlist_get_bool(nvl, "sq_flow_control"); 2587 } 2588 2589 toep->params.ulp_mode = ULP_MODE_NVMET; 2590 toep->ulpcb = qp; 2591 2592 send_txdataplen_max_flowc_wr(sc, toep, 2593 roundup(/* max_iso_pdus * */ max_tx_pdu_len, tp->t_maxseg)); 2594 set_ulp_mode_nvme(toep, ulp_submode, qp->rxpda); 2595 INP_WUNLOCK(inp); 2596 2597 fdrop(fp, curthread); 2598 2599 error = nvmf_che_setsockopt(so, max_tx_pdu_len, nca->max_receive_pdu); 2600 if (error != 0) { 2601 free(qp->fl_cid_set, M_NVMF_CHE); 2602 free(qp->fl_cids, M_NVMF_CHE); 2603 free(qp->open_fl_ttags, M_NVMF_CHE); 2604 free(qp, M_NVMF_CHE); 2605 return (NULL); 2606 } 2607 2608 num_ddp_tags = ddp_tags_per_qp; 2609 if (num_ddp_tags > 0) { 2610 qp->tpt_offset = t4_stag_alloc(sc, num_ddp_tags); 2611 if (qp->tpt_offset != T4_STAG_UNSET) { 2612 #ifdef VERBOSE_TRACES 2613 CTR(KTR_CXGBE, 2614 "%s: tid %u using %u tags at offset 0x%x", 2615 __func__, toep->tid, num_ddp_tags, qp->tpt_offset); 2616 #endif 2617 qp->num_ddp_tags = num_ddp_tags; 2618 qp->open_ddp_tags = mallocarray(qp->num_ddp_tags, 2619 sizeof(*qp->open_ddp_tags), M_NVMF_CHE, M_WAITOK | 2620 M_ZERO); 2621 2622 t4_nvme_set_tcb_field(toep, W_TCB_TPT_OFFSET, 2623 M_TCB_TPT_OFFSET, V_TCB_TPT_OFFSET(qp->tpt_offset)); 2624 } 2625 } 2626 2627 TAILQ_INIT(&qp->rx_buffers.head); 2628 TAILQ_INIT(&qp->tx_buffers.head); 2629 mtx_init(&qp->rx_buffers.lock, "nvmf/che rx buffers", NULL, MTX_DEF); 2630 mtx_init(&qp->tx_buffers.lock, "nvmf/che tx buffers", NULL, MTX_DEF); 2631 2632 cv_init(&qp->rx_cv, "-"); 2633 cv_init(&qp->tx_cv, "-"); 2634 mbufq_init(&qp->rx_data, 0); 2635 mbufq_init(&qp->rx_pdus, 0); 2636 STAILQ_INIT(&qp->tx_capsules); 2637 2638 /* Register socket upcall for receive to handle remote FIN. */ 2639 SOCKBUF_LOCK(&so->so_rcv); 2640 soupcall_set(so, SO_RCV, nvmf_che_soupcall_receive, qp); 2641 SOCKBUF_UNLOCK(&so->so_rcv); 2642 2643 /* Spin up kthreads. 
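The receive thread reassembles and dispatches PDUs queued by the CPL handlers; the send thread converts queued capsules into PDUs and writes them out via nvmf_che_write_pdu.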
*/ 2644 error = kthread_add(nvmf_che_receive, qp, NULL, &qp->rx_thread, 0, 0, 2645 "nvmef che rx"); 2646 if (error != 0) { 2647 che_free_qpair(&qp->qp); 2648 return (NULL); 2649 } 2650 error = kthread_add(nvmf_che_send, qp, NULL, &qp->tx_thread, 0, 0, 2651 "nvmef che tx"); 2652 if (error != 0) { 2653 che_free_qpair(&qp->qp); 2654 return (NULL); 2655 } 2656 2657 return (&qp->qp); 2658 } 2659 2660 static void 2661 che_release_qpair(struct nvmf_che_qpair *qp) 2662 { 2663 if (refcount_release(&qp->refs)) 2664 free(qp, M_NVMF_CHE); 2665 } 2666 2667 static void 2668 che_free_qpair(struct nvmf_qpair *nq) 2669 { 2670 struct nvmf_che_qpair *qp = CQP(nq); 2671 struct nvmf_che_command_buffer *ncb, *cb; 2672 struct nvmf_che_capsule *ncc, *cc; 2673 struct socket *so = qp->so; 2674 struct toepcb *toep = qp->toep; 2675 struct inpcb *inp = sotoinpcb(so); 2676 2677 /* Shut down kthreads. */ 2678 SOCKBUF_LOCK(&so->so_snd); 2679 qp->tx_shutdown = true; 2680 if (qp->tx_thread != NULL) { 2681 cv_signal(&qp->tx_cv); 2682 mtx_sleep(qp->tx_thread, SOCKBUF_MTX(&so->so_snd), 0, 2683 "nvchetx", 0); 2684 } 2685 SOCKBUF_UNLOCK(&so->so_snd); 2686 2687 SOCKBUF_LOCK(&so->so_rcv); 2688 qp->rx_shutdown = true; 2689 if (qp->rx_thread != NULL) { 2690 cv_signal(&qp->rx_cv); 2691 mtx_sleep(qp->rx_thread, SOCKBUF_MTX(&so->so_rcv), 0, 2692 "nvcherx", 0); 2693 } 2694 soupcall_clear(so, SO_RCV); 2695 SOCKBUF_UNLOCK(&so->so_rcv); 2696 mbufq_drain(&qp->rx_data); 2697 mbufq_drain(&qp->rx_pdus); 2698 2699 STAILQ_FOREACH_SAFE(cc, &qp->tx_capsules, link, ncc) { 2700 nvmf_abort_capsule_data(&cc->nc, ECONNABORTED); 2701 che_release_capsule(cc); 2702 } 2703 2704 cv_destroy(&qp->tx_cv); 2705 cv_destroy(&qp->rx_cv); 2706 2707 if (qp->open_fl_ttags != NULL) { 2708 for (u_int i = 0; i < qp->num_fl_ttags; i++) { 2709 cb = qp->open_fl_ttags[i]; 2710 if (cb != NULL) { 2711 cb->cc->active_r2ts--; 2712 cb->error = ECONNABORTED; 2713 che_release_command_buffer(cb); 2714 } 2715 } 2716 free(qp->open_fl_ttags, M_NVMF_CHE); 2717 } 2718 if (qp->num_ddp_tags != 0) { 2719 for (u_int i = 0; i < qp->num_ddp_tags; i++) { 2720 cb = qp->open_ddp_tags[i]; 2721 if (cb != NULL) { 2722 if (cb->cc != NULL) 2723 cb->cc->active_r2ts--; 2724 cb->error = ECONNABORTED; 2725 mtx_lock(&qp->rx_buffers.lock); 2726 che_free_ddp_tag(qp, cb, cb->ttag); 2727 mtx_unlock(&qp->rx_buffers.lock); 2728 che_release_command_buffer(cb); 2729 } 2730 } 2731 free(qp->open_ddp_tags, M_NVMF_CHE); 2732 } 2733 2734 mtx_lock(&qp->rx_buffers.lock); 2735 TAILQ_FOREACH_SAFE(cb, &qp->rx_buffers.head, link, ncb) { 2736 che_remove_command_buffer(&qp->rx_buffers, cb); 2737 mtx_unlock(&qp->rx_buffers.lock); 2738 #ifdef INVARIANTS 2739 if (cb->cc != NULL) 2740 cb->cc->pending_r2ts--; 2741 #endif 2742 cb->error = ECONNABORTED; 2743 che_release_command_buffer(cb); 2744 mtx_lock(&qp->rx_buffers.lock); 2745 } 2746 mtx_destroy(&qp->rx_buffers.lock); 2747 2748 mtx_lock(&qp->tx_buffers.lock); 2749 TAILQ_FOREACH_SAFE(cb, &qp->tx_buffers.head, link, ncb) { 2750 che_remove_command_buffer(&qp->tx_buffers, cb); 2751 mtx_unlock(&qp->tx_buffers.lock); 2752 cb->error = ECONNABORTED; 2753 che_release_command_buffer(cb); 2754 mtx_lock(&qp->tx_buffers.lock); 2755 } 2756 mtx_destroy(&qp->tx_buffers.lock); 2757 2758 if (qp->num_ddp_tags != 0) 2759 t4_stag_free(qp->nca->sc, qp->tpt_offset, qp->num_ddp_tags); 2760 2761 if (!qp->qp.nq_controller) { 2762 free(qp->fl_cids, M_NVMF_CHE); 2763 free(qp->fl_cid_set, M_NVMF_CHE); 2764 mtx_destroy(&qp->fl_cid_lock); 2765 } 2766 2767 INP_WLOCK(inp); 2768 toep->ulpcb = NULL; 2769 
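/* Discard any PDUs still queued for transmission on this tid. */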
mbufq_drain(&toep->ulp_pduq); 2770 2771 /* 2772 * Grab a reference to use when waiting for the final CPL to 2773 * be received. If toep->inp is NULL, then 2774 * final_cpl_received() has already been called (e.g. due to 2775 * the peer sending a RST). 2776 */ 2777 if (toep->inp != NULL) { 2778 toep = hold_toepcb(toep); 2779 toep->flags |= TPF_WAITING_FOR_FINAL; 2780 } else 2781 toep = NULL; 2782 INP_WUNLOCK(inp); 2783 2784 soclose(so); 2785 2786 /* 2787 * Wait for the socket to fully close. This ensures any 2788 * pending received data has been received (and in particular, 2789 * any data that would be received by DDP has been handled). 2790 */ 2791 if (toep != NULL) { 2792 struct mtx *lock = mtx_pool_find(mtxpool_sleep, toep); 2793 2794 mtx_lock(lock); 2795 while ((toep->flags & TPF_WAITING_FOR_FINAL) != 0) 2796 mtx_sleep(toep, lock, PSOCK, "conclo2", 0); 2797 mtx_unlock(lock); 2798 free_toepcb(toep); 2799 } 2800 2801 che_release_qpair(qp); 2802 } 2803 2804 static uint32_t 2805 che_max_ioccsz(struct nvmf_qpair *nq) 2806 { 2807 struct nvmf_che_qpair *qp = CQP(nq); 2808 2809 /* 2810 * Limit the command capsule size so that with maximum ICD it 2811 * fits within the limit of the largest PDU the adapter can 2812 * receive. 2813 */ 2814 return (qp->max_ioccsz); 2815 } 2816 2817 static uint64_t 2818 che_max_xfer_size(struct nvmf_qpair *nq) 2819 { 2820 struct nvmf_che_qpair *qp = CQP(nq); 2821 2822 /* 2823 * Limit host transfers to the size of the data payload in the 2824 * largest PDU the adapter can receive. 2825 */ 2826 return (qp->max_rx_data); 2827 } 2828 2829 static struct nvmf_capsule * 2830 che_allocate_capsule(struct nvmf_qpair *nq, int how) 2831 { 2832 struct nvmf_che_qpair *qp = CQP(nq); 2833 struct nvmf_che_capsule *cc; 2834 2835 cc = malloc(sizeof(*cc), M_NVMF_CHE, how | M_ZERO); 2836 if (cc == NULL) 2837 return (NULL); 2838 refcount_init(&cc->refs, 1); 2839 refcount_acquire(&qp->refs); 2840 return (&cc->nc); 2841 } 2842 2843 static void 2844 che_release_capsule(struct nvmf_che_capsule *cc) 2845 { 2846 struct nvmf_che_qpair *qp = CQP(cc->nc.nc_qpair); 2847 2848 if (!refcount_release(&cc->refs)) 2849 return; 2850 2851 MPASS(cc->active_r2ts == 0); 2852 MPASS(cc->pending_r2ts == 0); 2853 2854 nvmf_che_free_pdu(&cc->rx_pdu); 2855 free(cc, M_NVMF_CHE); 2856 che_release_qpair(qp); 2857 } 2858 2859 static void 2860 che_free_capsule(struct nvmf_capsule *nc) 2861 { 2862 che_release_capsule(CCAP(nc)); 2863 } 2864 2865 static int 2866 che_transmit_capsule(struct nvmf_capsule *nc) 2867 { 2868 struct nvmf_che_qpair *qp = CQP(nc->nc_qpair); 2869 struct nvmf_che_capsule *cc = CCAP(nc); 2870 struct socket *so = qp->so; 2871 2872 refcount_acquire(&cc->refs); 2873 SOCKBUF_LOCK(&so->so_snd); 2874 STAILQ_INSERT_TAIL(&qp->tx_capsules, cc, link); 2875 cv_signal(&qp->tx_cv); 2876 SOCKBUF_UNLOCK(&so->so_snd); 2877 return (0); 2878 } 2879 2880 static uint8_t 2881 che_validate_command_capsule(struct nvmf_capsule *nc) 2882 { 2883 struct nvmf_che_capsule *cc = CCAP(nc); 2884 struct nvme_sgl_descriptor *sgl; 2885 2886 KASSERT(cc->rx_pdu.hdr != NULL, ("capsule wasn't received")); 2887 2888 sgl = &nc->nc_sqe.sgl; 2889 switch (sgl->type) { 2890 case NVME_SGL_TYPE_ICD: 2891 if (cc->rx_pdu.data_len != le32toh(sgl->length)) { 2892 printf("NVMe/TCP: Command Capsule with mismatched ICD length\n"); 2893 return (NVME_SC_DATA_SGL_LENGTH_INVALID); 2894 } 2895 break; 2896 case NVME_SGL_TYPE_COMMAND_BUFFER: 2897 if (cc->rx_pdu.data_len != 0) { 2898 printf("NVMe/TCP: Command Buffer SGL with ICD\n"); 2899 return 
(NVME_SC_INVALID_FIELD); 2900 } 2901 break; 2902 default: 2903 printf("NVMe/TCP: Invalid SGL type in Command Capsule\n"); 2904 return (NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID); 2905 } 2906 2907 if (sgl->address != 0) { 2908 printf("NVMe/TCP: Invalid SGL offset in Command Capsule\n"); 2909 return (NVME_SC_SGL_OFFSET_INVALID); 2910 } 2911 2912 return (NVME_SC_SUCCESS); 2913 } 2914 2915 static size_t 2916 che_capsule_data_len(const struct nvmf_capsule *nc) 2917 { 2918 MPASS(nc->nc_qe_len == sizeof(struct nvme_command)); 2919 return (le32toh(nc->nc_sqe.sgl.length)); 2920 } 2921 2922 static void 2923 che_receive_r2t_data(struct nvmf_capsule *nc, uint32_t data_offset, 2924 struct nvmf_io_request *io) 2925 { 2926 struct nvmf_che_qpair *qp = CQP(nc->nc_qpair); 2927 struct nvmf_che_capsule *cc = CCAP(nc); 2928 struct nvmf_che_command_buffer *cb; 2929 2930 cb = che_alloc_command_buffer(qp, io, data_offset, io->io_len, 2931 nc->nc_sqe.cid); 2932 2933 cb->cc = cc; 2934 refcount_acquire(&cc->refs); 2935 2936 /* 2937 * If this command has too many active R2Ts or there are no 2938 * available transfer tags, queue the request for later. 2939 * 2940 * NB: maxr2t is 0's based. 2941 */ 2942 mtx_lock(&qp->rx_buffers.lock); 2943 if (cc->active_r2ts > qp->maxr2t || 2944 !nvmf_che_allocate_ttag(qp, cb)) { 2945 #ifdef INVARIANTS 2946 cc->pending_r2ts++; 2947 #endif 2948 TAILQ_INSERT_TAIL(&qp->rx_buffers.head, cb, link); 2949 mtx_unlock(&qp->rx_buffers.lock); 2950 return; 2951 } 2952 mtx_unlock(&qp->rx_buffers.lock); 2953 2954 che_send_r2t(qp, nc->nc_sqe.cid, cb->ttag, data_offset, io->io_len); 2955 } 2956 2957 static void 2958 che_receive_icd_data(struct nvmf_capsule *nc, uint32_t data_offset, 2959 struct nvmf_io_request *io) 2960 { 2961 struct nvmf_che_capsule *cc = CCAP(nc); 2962 2963 /* 2964 * The header is in rx_pdu.m, the padding is discarded, and 2965 * the data starts at rx_pdu.m->m_next. 2966 */ 2967 mbuf_copyto_io(cc->rx_pdu.m->m_next, data_offset, io->io_len, io, 0); 2968 nvmf_complete_io_request(io, io->io_len, 0); 2969 } 2970 2971 static int 2972 che_receive_controller_data(struct nvmf_capsule *nc, uint32_t data_offset, 2973 struct nvmf_io_request *io) 2974 { 2975 struct nvme_sgl_descriptor *sgl; 2976 size_t data_len; 2977 2978 if (nc->nc_qe_len != sizeof(struct nvme_command) || 2979 !nc->nc_qpair->nq_controller) 2980 return (EINVAL); 2981 2982 sgl = &nc->nc_sqe.sgl; 2983 data_len = le32toh(sgl->length); 2984 if (data_offset + io->io_len > data_len) 2985 return (EFBIG); 2986 2987 if (sgl->type == NVME_SGL_TYPE_ICD) 2988 che_receive_icd_data(nc, data_offset, io); 2989 else 2990 che_receive_r2t_data(nc, data_offset, io); 2991 return (0); 2992 } 2993 2994 /* NB: cid is little-endian already. 
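Callers pass nc_sqe.cid straight from the SQE, so it is copied into cccid without another byte swap.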
*/ 2995 static void 2996 che_send_c2h_pdu(struct nvmf_che_qpair *qp, uint16_t cid, uint32_t data_offset, 2997 struct mbuf *m, size_t len, bool last_pdu, bool success) 2998 { 2999 struct nvme_tcp_c2h_data_hdr c2h; 3000 struct mbuf *top; 3001 3002 memset(&c2h, 0, sizeof(c2h)); 3003 c2h.common.pdu_type = NVME_TCP_PDU_TYPE_C2H_DATA; 3004 if (last_pdu) 3005 c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_LAST_PDU; 3006 if (success) 3007 c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_SUCCESS; 3008 c2h.cccid = cid; 3009 c2h.datao = htole32(data_offset); 3010 c2h.datal = htole32(len); 3011 3012 top = nvmf_che_construct_pdu(qp, &c2h, sizeof(c2h), m, len); 3013 nvmf_che_write_pdu(qp, top); 3014 } 3015 3016 static u_int 3017 che_send_controller_data(struct nvmf_capsule *nc, uint32_t data_offset, 3018 struct mbuf *m, size_t len) 3019 { 3020 struct nvmf_che_qpair *qp = CQP(nc->nc_qpair); 3021 struct nvme_sgl_descriptor *sgl; 3022 uint32_t data_len; 3023 bool last_pdu, last_xfer; 3024 3025 if (nc->nc_qe_len != sizeof(struct nvme_command) || 3026 !qp->qp.nq_controller) { 3027 m_freem(m); 3028 return (NVME_SC_INVALID_FIELD); 3029 } 3030 3031 sgl = &nc->nc_sqe.sgl; 3032 data_len = le32toh(sgl->length); 3033 if (data_offset + len > data_len) { 3034 m_freem(m); 3035 return (NVME_SC_INVALID_FIELD); 3036 } 3037 last_xfer = (data_offset + len == data_len); 3038 3039 if (sgl->type != NVME_SGL_TYPE_COMMAND_BUFFER) { 3040 m_freem(m); 3041 return (NVME_SC_INVALID_FIELD); 3042 } 3043 3044 KASSERT(data_offset == CCAP(nc)->tx_data_offset, 3045 ("%s: starting data_offset %u doesn't match end of previous xfer %u", 3046 __func__, data_offset, CCAP(nc)->tx_data_offset)); 3047 3048 /* Queue one or more C2H_DATA PDUs containing the data from 'm'. */ 3049 while (m != NULL) { 3050 struct mbuf *n; 3051 uint32_t todo; 3052 3053 if (m->m_len > qp->max_tx_data) { 3054 n = m_split(m, qp->max_tx_data, M_WAITOK); 3055 todo = m->m_len; 3056 } else { 3057 struct mbuf *p; 3058 3059 todo = m->m_len; 3060 p = m; 3061 n = p->m_next; 3062 while (n != NULL) { 3063 if (todo + n->m_len > qp->max_tx_data) { 3064 p->m_next = NULL; 3065 break; 3066 } 3067 todo += n->m_len; 3068 p = n; 3069 n = p->m_next; 3070 } 3071 MPASS(m_length(m, NULL) == todo); 3072 } 3073 3074 last_pdu = (n == NULL && last_xfer); 3075 che_send_c2h_pdu(qp, nc->nc_sqe.cid, data_offset, m, todo, 3076 last_pdu, last_pdu && qp->send_success); 3077 3078 data_offset += todo; 3079 data_len -= todo; 3080 m = n; 3081 } 3082 MPASS(data_len == 0); 3083 3084 #ifdef INVARIANTS 3085 CCAP(nc)->tx_data_offset = data_offset; 3086 #endif 3087 if (!last_xfer) 3088 return (NVMF_MORE); 3089 else if (qp->send_success) 3090 return (NVMF_SUCCESS_SENT); 3091 else 3092 return (NVME_SC_SUCCESS); 3093 } 3094 3095 struct nvmf_transport_ops che_ops = { 3096 .allocate_qpair = che_allocate_qpair, 3097 .free_qpair = che_free_qpair, 3098 .max_ioccsz = che_max_ioccsz, 3099 .max_xfer_size = che_max_xfer_size, 3100 .allocate_capsule = che_allocate_capsule, 3101 .free_capsule = che_free_capsule, 3102 .transmit_capsule = che_transmit_capsule, 3103 .validate_command_capsule = che_validate_command_capsule, 3104 .capsule_data_len = che_capsule_data_len, 3105 .receive_controller_data = che_receive_controller_data, 3106 .send_controller_data = che_send_controller_data, 3107 .trtype = NVMF_TRTYPE_TCP, 3108 .priority = 10, 3109 }; 3110 3111 NVMF_TRANSPORT(che, che_ops); 3112 3113 static void 3114 read_pdu_limits(struct adapter *sc, u_int *max_tx_pdu_len, 3115 uint32_t *max_rx_pdu_len) 3116 { 3117 uint32_t tx_len, rx_len, r, 
v; 3118 3119 /* Copied from cxgbei, but not sure if this is correct. */ 3120 rx_len = t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE); 3121 tx_len = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE); 3122 3123 r = t4_read_reg(sc, A_TP_PARA_REG2); 3124 rx_len = min(rx_len, G_MAXRXDATA(r)); 3125 tx_len = min(tx_len, G_MAXRXDATA(r)); 3126 3127 r = t4_read_reg(sc, A_TP_PARA_REG7); 3128 v = min(G_PMMAXXFERLEN0(r), G_PMMAXXFERLEN1(r)); 3129 rx_len = min(rx_len, v); 3130 tx_len = min(tx_len, v); 3131 3132 /* Cannot be larger than 32KB - 256. */ 3133 rx_len = min(rx_len, 32512); 3134 tx_len = min(tx_len, 32512); 3135 3136 *max_tx_pdu_len = tx_len; 3137 *max_rx_pdu_len = rx_len; 3138 } 3139 3140 static int 3141 nvmf_che_init(struct adapter *sc, struct nvmf_che_adapter *nca) 3142 { 3143 struct sysctl_oid *oid; 3144 struct sysctl_oid_list *children; 3145 uint32_t val; 3146 3147 read_pdu_limits(sc, &nca->max_transmit_pdu, &nca->max_receive_pdu); 3148 if (nca->max_transmit_pdu > che_max_transmit_pdu) 3149 nca->max_transmit_pdu = che_max_transmit_pdu; 3150 if (nca->max_receive_pdu > che_max_receive_pdu) 3151 nca->max_receive_pdu = che_max_receive_pdu; 3152 val = t4_read_reg(sc, A_SGE_CONTROL2); 3153 nca->nvmt_data_iqe = (val & F_RXCPLMODE_NVMT) != 0; 3154 3155 sysctl_ctx_init(&nca->ctx); 3156 oid = device_get_sysctl_tree(sc->dev); /* dev.che.X */ 3157 children = SYSCTL_CHILDREN(oid); 3158 3159 oid = SYSCTL_ADD_NODE(&nca->ctx, children, OID_AUTO, "nvme", 3160 CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "NVMe ULP settings"); 3161 children = SYSCTL_CHILDREN(oid); 3162 3163 nca->ddp_threshold = 8192; 3164 SYSCTL_ADD_UINT(&nca->ctx, children, OID_AUTO, "ddp_threshold", 3165 CTLFLAG_RW, &nca->ddp_threshold, 0, "Rx zero copy threshold"); 3166 3167 SYSCTL_ADD_UINT(&nca->ctx, children, OID_AUTO, "max_transmit_pdu", 3168 CTLFLAG_RW, &nca->max_transmit_pdu, 0, 3169 "Maximum size of a transmitted PDU"); 3170 3171 SYSCTL_ADD_UINT(&nca->ctx, children, OID_AUTO, "max_receive_pdu", 3172 CTLFLAG_RW, &nca->max_receive_pdu, 0, 3173 "Maximum size of a received PDU"); 3174 3175 return (0); 3176 } 3177 3178 static void 3179 nvmf_che_destroy(struct nvmf_che_adapter *nca) 3180 { 3181 sysctl_ctx_free(&nca->ctx); 3182 free(nca, M_CXGBE); 3183 } 3184 3185 static int 3186 nvmf_che_activate(struct adapter *sc) 3187 { 3188 struct nvmf_che_adapter *nca; 3189 int rc; 3190 3191 ASSERT_SYNCHRONIZED_OP(sc); 3192 3193 if (uld_active(sc, ULD_NVME)) { 3194 KASSERT(0, ("%s: NVMe offload already enabled on adapter %p", 3195 __func__, sc)); 3196 return (0); 3197 } 3198 3199 if ((sc->nvmecaps & FW_CAPS_CONFIG_NVME_TCP) == 0) { 3200 device_printf(sc->dev, 3201 "not NVMe offload capable, or capability disabled\n"); 3202 return (ENOSYS); 3203 } 3204 3205 /* per-adapter softc for NVMe */ 3206 nca = malloc(sizeof(*nca), M_CXGBE, M_ZERO | M_WAITOK); 3207 nca->sc = sc; 3208 3209 rc = nvmf_che_init(sc, nca); 3210 if (rc != 0) { 3211 free(nca, M_CXGBE); 3212 return (rc); 3213 } 3214 3215 sc->nvme_ulp_softc = nca; 3216 3217 return (0); 3218 } 3219 3220 static int 3221 nvmf_che_deactivate(struct adapter *sc) 3222 { 3223 struct nvmf_che_adapter *nca = sc->nvme_ulp_softc; 3224 3225 ASSERT_SYNCHRONIZED_OP(sc); 3226 3227 if (nca != NULL) { 3228 nvmf_che_destroy(nca); 3229 sc->nvme_ulp_softc = NULL; 3230 } 3231 3232 return (0); 3233 } 3234 3235 static void 3236 nvmf_che_activate_all(struct adapter *sc, void *arg __unused) 3237 { 3238 if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t7nvact") != 0) 3239 return; 3240 3241 /* Activate NVMe if any port on this adapter has IFCAP_TOE 
enabled. */ 3242 if (sc->offload_map && !uld_active(sc, ULD_NVME)) 3243 (void) t4_activate_uld(sc, ULD_NVME); 3244 3245 end_synchronized_op(sc, 0); 3246 } 3247 3248 static void 3249 nvmf_che_deactivate_all(struct adapter *sc, void *arg __unused) 3250 { 3251 if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t7nvdea") != 0) 3252 return; 3253 3254 if (uld_active(sc, ULD_NVME)) 3255 (void) t4_deactivate_uld(sc, ULD_NVME); 3256 3257 end_synchronized_op(sc, 0); 3258 } 3259 3260 static struct uld_info nvmf_che_uld_info = { 3261 .uld_activate = nvmf_che_activate, 3262 .uld_deactivate = nvmf_che_deactivate, 3263 }; 3264 3265 static int 3266 nvmf_che_mod_load(void) 3267 { 3268 int rc; 3269 3270 t4_register_cpl_handler(CPL_NVMT_CMP, do_nvmt_cmp); 3271 t4_register_cpl_handler(CPL_NVMT_DATA, do_nvmt_data); 3272 3273 rc = t4_register_uld(&nvmf_che_uld_info, ULD_NVME); 3274 if (rc != 0) 3275 return (rc); 3276 3277 t4_iterate(nvmf_che_activate_all, NULL); 3278 3279 return (rc); 3280 } 3281 3282 static int 3283 nvmf_che_mod_unload(void) 3284 { 3285 t4_iterate(nvmf_che_deactivate_all, NULL); 3286 3287 if (t4_unregister_uld(&nvmf_che_uld_info, ULD_NVME) == EBUSY) 3288 return (EBUSY); 3289 3290 t4_register_cpl_handler(CPL_NVMT_CMP, NULL); 3291 t4_register_cpl_handler(CPL_NVMT_DATA, NULL); 3292 3293 return (0); 3294 } 3295 #endif 3296 3297 static int 3298 nvmf_che_modevent(module_t mod, int cmd, void *arg) 3299 { 3300 int rc; 3301 3302 #ifdef TCP_OFFLOAD 3303 switch (cmd) { 3304 case MOD_LOAD: 3305 rc = nvmf_che_mod_load(); 3306 break; 3307 case MOD_UNLOAD: 3308 rc = nvmf_che_mod_unload(); 3309 break; 3310 default: 3311 rc = EOPNOTSUPP; 3312 break; 3313 } 3314 #else 3315 printf("nvmf_che: compiled without TCP_OFFLOAD support.\n"); 3316 rc = EOPNOTSUPP; 3317 #endif 3318 3319 return (rc); 3320 } 3321 3322 static moduledata_t nvmf_che_mod = { 3323 "nvmf_che", 3324 nvmf_che_modevent, 3325 NULL, 3326 }; 3327 3328 MODULE_VERSION(nvmf_che, 1); 3329 DECLARE_MODULE(nvmf_che, nvmf_che_mod, SI_SUB_EXEC, SI_ORDER_ANY); 3330 MODULE_DEPEND(nvmf_che, t4_tom, 1, 1, 1); 3331 MODULE_DEPEND(nvmf_che, cxgbe, 1, 1, 1); 3332