1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2023 Chelsio Communications, Inc. 5 * Written by: John Baldwin <jhb@FreeBSD.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include "opt_inet.h" 30 31 #include <sys/param.h> 32 #include <sys/libkern.h> 33 #include <sys/kernel.h> 34 #include <sys/module.h> 35 36 #ifdef TCP_OFFLOAD 37 #include <sys/bitset.h> 38 #include <sys/capsicum.h> 39 #include <sys/file.h> 40 #include <sys/kthread.h> 41 #include <sys/ktr.h> 42 #include <sys/malloc.h> 43 #include <sys/mbuf.h> 44 #include <sys/nv.h> 45 #include <sys/protosw.h> 46 #include <sys/socket.h> 47 #include <sys/socketvar.h> 48 #include <netinet/in.h> 49 #include <netinet/in_pcb.h> 50 #include <netinet/tcp_var.h> 51 #include <netinet/toecore.h> 52 53 #include <dev/nvmf/nvmf.h> 54 #include <dev/nvmf/nvmf_proto.h> 55 #include <dev/nvmf/nvmf_tcp.h> 56 #include <dev/nvmf/nvmf_transport.h> 57 #include <dev/nvmf/nvmf_transport_internal.h> 58 59 #include <vm/pmap.h> 60 #include <vm/vm_page.h> 61 62 #include "common/common.h" 63 #include "common/t4_regs.h" 64 #include "common/t4_tcb.h" 65 #include "tom/t4_tom.h" 66 67 /* Status code values in CPL_NVMT_CMP. */ 68 #define CMP_STATUS_ERROR_MASK 0x7f 69 #define CMP_STATUS_NO_ERROR 0 70 #define CMP_STATUS_HEADER_DIGEST 1 71 #define CMP_STATUS_DIRECTION_MISMATCH 2 72 #define CMP_STATUS_DIGEST_FLAG_MISMATCH 3 73 #define CMP_STATUS_SUCCESS_NOT_LAST 4 74 #define CMP_STATUS_BAD_DATA_LENGTH 5 75 #define CMP_STATUS_USER_MODE_UNALLOCATED 6 76 #define CMP_STATUS_RQT_LIMIT 7 77 #define CMP_STATUS_RQT_WRAP 8 78 #define CMP_STATUS_RQT_BOUND 9 79 #define CMP_STATUS_TPT_LIMIT 16 80 #define CMP_STATUS_TPT_INVALID 17 81 #define CMP_STATUS_TPT_COLOUR_MISMATCH 18 82 #define CMP_STATUS_TPT_MISC 19 83 #define CMP_STATUS_TPT_WRAP 20 84 #define CMP_STATUS_TPT_BOUND 21 85 #define CMP_STATUS_TPT_LAST_PDU_UNALIGNED 22 86 #define CMP_STATUS_PBL_LIMIT 24 87 #define CMP_STATUS_DATA_DIGEST 25 88 #define CMP_STATUS_DDP 0x80 89 90 /* 91 * Transfer tags and CIDs with the MSB set are "unallocated" tags that 92 * pass data through to the freelist without using DDP. 93 */ 94 #define CHE_FL_TAG_MASK 0x8000 95 #define CHE_MAX_FL_TAG 0x7fff 96 #define CHE_NUM_FL_TAGS (CHE_MAX_FL_TAG + 1) 97 98 #define CHE_TAG_IS_FL(ttag) (((ttag) & CHE_FL_TAG_MASK) == CHE_FL_TAG_MASK) 99 #define CHE_RAW_FL_TAG(ttag) ((ttag) & ~CHE_FL_TAG_MASK) 100 #define CHE_DDP_TAG(stag_idx, color) ((stag_idx) << 4 | (color)) 101 #define CHE_STAG_COLOR(stag) ((stag) & 0xf) 102 #define CHE_STAG_IDX(stag) ((stag) >> 4) 103 #define CHE_DDP_MAX_COLOR 0xf 104 105 #define CHE_DDP_NO_TAG 0xffff 106 107 /* 108 * A bitmap of non-DDP CIDs in use on the host. Since there is no 109 * _BIT_FFC (find first clear), the bitset is inverted so that a clear 110 * bit indicates an in-use CID. 111 */ 112 BITSET_DEFINE(fl_cid_set, CHE_NUM_FL_TAGS); 113 #define FL_CID_INIT(p) __BIT_FILL(CHE_NUM_FL_TAGS, p) 114 #define FL_CID_BUSY(n, p) __BIT_CLR(CHE_NUM_FL_TAGS, n, p) 115 #define FL_CID_ISACTIVE(n, p) !__BIT_ISSET(CHE_NUM_FL_TAGS, n, p) 116 #define FL_CID_FREE(n, p) __BIT_SET(CHE_NUM_FL_TAGS, n, p) 117 #define FL_CID_FINDFREE_AT(p, start) __BIT_FFS_AT(CHE_NUM_FL_TAGS, p, start) 118 119 /* 120 * The TCP sequence number of both CPL_NVMT_DATA and CPL_NVMT_CMP 121 * mbufs are saved here while the mbuf is in qp->rx_data and qp->rx_pdus. 122 */ 123 #define nvmf_tcp_seq PH_loc.thirtytwo[0] 124 125 /* 126 * The CPL status of CPL_NVMT_CMP mbufs are saved here while the mbuf 127 * is in qp->rx_pdus. 128 */ 129 #define nvmf_cpl_status PH_loc.eight[4] 130 131 struct nvmf_che_capsule; 132 struct nvmf_che_qpair; 133 134 struct nvmf_che_adapter { 135 struct adapter *sc; 136 137 u_int ddp_threshold; 138 u_int max_transmit_pdu; 139 u_int max_receive_pdu; 140 bool nvmt_data_iqe; 141 142 struct sysctl_ctx_list ctx; /* from uld_activate to deactivate */ 143 }; 144 145 struct nvmf_che_command_buffer { 146 struct nvmf_che_qpair *qp; 147 148 struct nvmf_io_request io; 149 size_t data_len; 150 size_t data_xfered; 151 uint32_t data_offset; 152 153 u_int refs; 154 int error; 155 156 bool ddp_ok; 157 uint16_t cid; 158 uint16_t ttag; 159 uint16_t original_cid; /* Host only */ 160 161 TAILQ_ENTRY(nvmf_che_command_buffer) link; 162 163 /* Fields used for DDP. */ 164 struct fw_ri_tpte tpte; 165 uint64_t *pbl; 166 uint32_t pbl_addr; 167 uint32_t pbl_len; 168 169 /* Controller only */ 170 struct nvmf_che_capsule *cc; 171 }; 172 173 struct nvmf_che_command_buffer_list { 174 TAILQ_HEAD(, nvmf_che_command_buffer) head; 175 struct mtx lock; 176 }; 177 178 struct nvmf_che_qpair { 179 struct nvmf_qpair qp; 180 181 struct socket *so; 182 struct toepcb *toep; 183 struct nvmf_che_adapter *nca; 184 185 volatile u_int refs; /* Every allocated capsule holds a reference */ 186 uint8_t txpda; 187 uint8_t rxpda; 188 bool header_digests; 189 bool data_digests; 190 uint32_t maxr2t; 191 uint32_t maxh2cdata; /* Controller only */ 192 uint32_t max_rx_data; 193 uint32_t max_tx_data; 194 uint32_t max_icd; /* Host only */ 195 uint32_t max_ioccsz; /* Controller only */ 196 union { 197 uint16_t next_fl_ttag; /* Controller only */ 198 uint16_t next_cid; /* Host only */ 199 }; 200 uint16_t next_ddp_tag; 201 u_int num_fl_ttags; /* Controller only */ 202 u_int active_fl_ttags; /* Controller only */ 203 u_int num_ddp_tags; 204 u_int active_ddp_tags; 205 bool send_success; /* Controller only */ 206 uint8_t ddp_color; 207 uint32_t tpt_offset; 208 209 /* Receive state. */ 210 struct thread *rx_thread; 211 struct cv rx_cv; 212 bool rx_shutdown; 213 int rx_error; 214 struct mbufq rx_data; /* Data received via CPL_NVMT_DATA. */ 215 struct mbufq rx_pdus; /* PDU headers received via CPL_NVMT_CMP. */ 216 217 /* Transmit state. */ 218 struct thread *tx_thread; 219 struct cv tx_cv; 220 bool tx_shutdown; 221 STAILQ_HEAD(, nvmf_che_capsule) tx_capsules; 222 223 struct nvmf_che_command_buffer_list tx_buffers; 224 struct nvmf_che_command_buffer_list rx_buffers; 225 226 /* 227 * For the controller, an RX command buffer can be in one of 228 * three locations, all protected by the rx_buffers.lock. If 229 * a receive request is waiting for either an R2T slot for its 230 * command (due to exceeding MAXR2T), or a transfer tag it is 231 * placed on the rx_buffers list. When a request is allocated 232 * an active transfer tag, it moves to either the 233 * open_ddp_tags[] or open_fl_ttags[] array (indexed by the 234 * tag) until it completes. 235 * 236 * For the host, an RX command buffer using DDP is in 237 * open_ddp_tags[], otherwise it is in rx_buffers. 238 */ 239 struct nvmf_che_command_buffer **open_ddp_tags; 240 struct nvmf_che_command_buffer **open_fl_ttags; /* Controller only */ 241 242 /* 243 * For the host, CIDs submitted by nvmf(4) must be rewritten 244 * to either use DDP or not use DDP. The CID in response 245 * capsules must be restored to their original value. For 246 * DDP, the original CID is stored in the command buffer. 247 * These variables manage non-DDP CIDs. 248 */ 249 uint16_t *fl_cids; /* Host only */ 250 struct fl_cid_set *fl_cid_set; /* Host only */ 251 struct mtx fl_cid_lock; /* Host only */ 252 }; 253 254 struct nvmf_che_rxpdu { 255 struct mbuf *m; 256 const struct nvme_tcp_common_pdu_hdr *hdr; 257 uint32_t data_len; 258 bool data_digest_mismatch; 259 bool ddp; 260 }; 261 262 struct nvmf_che_capsule { 263 struct nvmf_capsule nc; 264 265 volatile u_int refs; 266 267 struct nvmf_che_rxpdu rx_pdu; 268 269 uint32_t active_r2ts; /* Controller only */ 270 #ifdef INVARIANTS 271 uint32_t tx_data_offset; /* Controller only */ 272 u_int pending_r2ts; /* Controller only */ 273 #endif 274 275 STAILQ_ENTRY(nvmf_che_capsule) link; 276 }; 277 278 #define CCAP(nc) ((struct nvmf_che_capsule *)(nc)) 279 #define CQP(qp) ((struct nvmf_che_qpair *)(qp)) 280 281 static void che_release_capsule(struct nvmf_che_capsule *cc); 282 static void che_free_qpair(struct nvmf_qpair *nq); 283 284 SYSCTL_NODE(_kern_nvmf, OID_AUTO, che, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 285 "Chelsio TCP offload transport"); 286 287 static u_int che_max_transmit_pdu = 32 * 1024; 288 SYSCTL_UINT(_kern_nvmf_che, OID_AUTO, max_transmit_pdu, CTLFLAG_RWTUN, 289 &che_max_transmit_pdu, 0, 290 "Maximum size of a transmitted PDU"); 291 292 static u_int che_max_receive_pdu = 32 * 1024; 293 SYSCTL_UINT(_kern_nvmf_che, OID_AUTO, max_receive_pdu, CTLFLAG_RWTUN, 294 &che_max_receive_pdu, 0, 295 "Maximum size of a received PDU"); 296 297 static int use_dsgl = 1; 298 SYSCTL_INT(_kern_nvmf_che, OID_AUTO, use_dsgl, CTLFLAG_RWTUN, &use_dsgl, 0, 299 "Use DSGL for PBL/FastReg (default=1)"); 300 301 static int inline_threshold = 256; 302 SYSCTL_INT(_kern_nvmf_che, OID_AUTO, inline_threshold, CTLFLAG_RWTUN, 303 &inline_threshold, 0, 304 "inline vs dsgl threshold (default=256)"); 305 306 static int ddp_tags_per_qp = 128; 307 SYSCTL_INT(_kern_nvmf_che, OID_AUTO, ddp_tags_per_qp, CTLFLAG_RWTUN, 308 &ddp_tags_per_qp, 0, 309 "Number of DDP tags to reserve for each queue pair"); 310 311 static MALLOC_DEFINE(M_NVMF_CHE, "nvmf_che", "Chelsio NVMe-TCP offload"); 312 313 /* 314 * PBL regions consist of N full-sized pages. TPT entries support an 315 * initial offset into the first page (FBO) and can handle a partial 316 * length on the last page. 317 */ 318 static bool 319 che_ddp_io_check(struct nvmf_che_qpair *qp, const struct nvmf_io_request *io) 320 { 321 const struct memdesc *mem = &io->io_mem; 322 struct bus_dma_segment *ds; 323 int i; 324 325 if (io->io_len < qp->nca->ddp_threshold) { 326 return (false); 327 } 328 329 switch (mem->md_type) { 330 case MEMDESC_VADDR: 331 case MEMDESC_PADDR: 332 case MEMDESC_VMPAGES: 333 return (true); 334 case MEMDESC_VLIST: 335 case MEMDESC_PLIST: 336 /* 337 * Require all but the first segment to start on a 338 * page boundary. Require all but the last segment to 339 * end on a page boundary. 340 */ 341 ds = mem->u.md_list; 342 for (i = 0; i < mem->md_nseg; i++, ds++) { 343 if (i != 0 && ds->ds_addr % PAGE_SIZE != 0) 344 return (false); 345 if (i != mem->md_nseg - 1 && 346 (ds->ds_addr + ds->ds_len) % PAGE_SIZE != 0) 347 return (false); 348 } 349 return (true); 350 default: 351 /* 352 * Other types could be validated with more work, but 353 * they aren't used currently by nvmf(4) or nvmft(4). 354 */ 355 return (false); 356 } 357 } 358 359 static u_int 360 che_fbo(struct nvmf_che_command_buffer *cb) 361 { 362 struct memdesc *mem = &cb->io.io_mem; 363 364 switch (mem->md_type) { 365 case MEMDESC_VADDR: 366 return ((uintptr_t)mem->u.md_vaddr & PAGE_MASK); 367 case MEMDESC_PADDR: 368 return (mem->u.md_paddr & PAGE_MASK); 369 case MEMDESC_VMPAGES: 370 return (mem->md_offset); 371 case MEMDESC_VLIST: 372 case MEMDESC_PLIST: 373 return (mem->u.md_list[0].ds_addr & PAGE_MASK); 374 default: 375 __assert_unreachable(); 376 } 377 } 378 379 static u_int 380 che_npages(struct nvmf_che_command_buffer *cb) 381 { 382 return (howmany(che_fbo(cb) + cb->io.io_len, PAGE_SIZE)); 383 } 384 385 static struct nvmf_che_command_buffer * 386 che_alloc_command_buffer(struct nvmf_che_qpair *qp, 387 const struct nvmf_io_request *io, uint32_t data_offset, size_t data_len, 388 uint16_t cid) 389 { 390 struct nvmf_che_command_buffer *cb; 391 392 cb = malloc(sizeof(*cb), M_NVMF_CHE, M_WAITOK); 393 cb->qp = qp; 394 cb->io = *io; 395 cb->data_offset = data_offset; 396 cb->data_len = data_len; 397 cb->data_xfered = 0; 398 refcount_init(&cb->refs, 1); 399 cb->error = 0; 400 cb->ddp_ok = che_ddp_io_check(qp, io); 401 cb->cid = cid; 402 cb->ttag = 0; 403 cb->original_cid = 0; 404 cb->cc = NULL; 405 cb->pbl = NULL; 406 407 return (cb); 408 } 409 410 static void 411 che_hold_command_buffer(struct nvmf_che_command_buffer *cb) 412 { 413 refcount_acquire(&cb->refs); 414 } 415 416 static void 417 che_free_command_buffer(struct nvmf_che_command_buffer *cb) 418 { 419 nvmf_complete_io_request(&cb->io, cb->data_xfered, cb->error); 420 if (cb->cc != NULL) 421 che_release_capsule(cb->cc); 422 MPASS(cb->pbl == NULL); 423 free(cb, M_NVMF_CHE); 424 } 425 426 static void 427 che_release_command_buffer(struct nvmf_che_command_buffer *cb) 428 { 429 if (refcount_release(&cb->refs)) 430 che_free_command_buffer(cb); 431 } 432 433 static void 434 che_add_command_buffer(struct nvmf_che_command_buffer_list *list, 435 struct nvmf_che_command_buffer *cb) 436 { 437 mtx_assert(&list->lock, MA_OWNED); 438 TAILQ_INSERT_HEAD(&list->head, cb, link); 439 } 440 441 static struct nvmf_che_command_buffer * 442 che_find_command_buffer(struct nvmf_che_command_buffer_list *list, 443 uint16_t cid) 444 { 445 struct nvmf_che_command_buffer *cb; 446 447 mtx_assert(&list->lock, MA_OWNED); 448 TAILQ_FOREACH(cb, &list->head, link) { 449 if (cb->cid == cid) 450 return (cb); 451 } 452 return (NULL); 453 } 454 455 static void 456 che_remove_command_buffer(struct nvmf_che_command_buffer_list *list, 457 struct nvmf_che_command_buffer *cb) 458 { 459 mtx_assert(&list->lock, MA_OWNED); 460 TAILQ_REMOVE(&list->head, cb, link); 461 } 462 463 static void 464 che_purge_command_buffer(struct nvmf_che_command_buffer_list *list, 465 uint16_t cid) 466 { 467 struct nvmf_che_command_buffer *cb; 468 469 mtx_lock(&list->lock); 470 cb = che_find_command_buffer(list, cid); 471 if (cb != NULL) { 472 che_remove_command_buffer(list, cb); 473 mtx_unlock(&list->lock); 474 che_release_command_buffer(cb); 475 } else 476 mtx_unlock(&list->lock); 477 } 478 479 static int 480 che_write_mem_inline(struct adapter *sc, struct toepcb *toep, uint32_t addr, 481 uint32_t len, void *data, struct mbufq *wrq) 482 { 483 struct mbuf *m; 484 char *cp; 485 int copy_len, i, num_wqe, wr_len; 486 487 #ifdef VERBOSE_TRACES 488 CTR(KTR_CXGBE, "%s: addr 0x%x len %u", __func__, addr << 5, len); 489 #endif 490 num_wqe = DIV_ROUND_UP(len, T4_MAX_INLINE_SIZE); 491 cp = data; 492 for (i = 0; i < num_wqe; i++) { 493 copy_len = min(len, T4_MAX_INLINE_SIZE); 494 wr_len = T4_WRITE_MEM_INLINE_LEN(copy_len); 495 496 m = alloc_raw_wr_mbuf(wr_len); 497 if (m == NULL) 498 return (ENOMEM); 499 t4_write_mem_inline_wr(sc, mtod(m, void *), wr_len, toep->tid, 500 addr, copy_len, cp, 0); 501 if (cp != NULL) 502 cp += T4_MAX_INLINE_SIZE; 503 addr += T4_MAX_INLINE_SIZE >> 5; 504 len -= T4_MAX_INLINE_SIZE; 505 506 mbufq_enqueue(wrq, m); 507 } 508 return (0); 509 } 510 511 static int 512 che_write_mem_dma_aligned(struct adapter *sc, struct toepcb *toep, 513 uint32_t addr, uint32_t len, void *data, struct mbufq *wrq) 514 { 515 struct mbuf *m; 516 vm_offset_t va; 517 u_int todo; 518 int wr_len; 519 520 /* First page. */ 521 va = (vm_offset_t)data; 522 todo = min(PAGE_SIZE - (va % PAGE_SIZE), len); 523 wr_len = T4_WRITE_MEM_DMA_LEN; 524 m = alloc_raw_wr_mbuf(wr_len); 525 if (m == NULL) 526 return (ENOMEM); 527 t4_write_mem_dma_wr(sc, mtod(m, void *), wr_len, toep->tid, addr, 528 todo, pmap_kextract(va), 0); 529 mbufq_enqueue(wrq, m); 530 len -= todo; 531 addr += todo >> 5; 532 va += todo; 533 534 while (len > 0) { 535 MPASS(va == trunc_page(va)); 536 todo = min(PAGE_SIZE, len); 537 m = alloc_raw_wr_mbuf(wr_len); 538 if (m == NULL) 539 return (ENOMEM); 540 t4_write_mem_dma_wr(sc, mtod(m, void *), wr_len, toep->tid, 541 addr, todo, pmap_kextract(va), 0); 542 mbufq_enqueue(wrq, m); 543 len -= todo; 544 addr += todo >> 5; 545 va += todo; 546 } 547 return (0); 548 } 549 550 static int 551 che_write_adapter_mem(struct nvmf_che_qpair *qp, uint32_t addr, uint32_t len, 552 void *data) 553 { 554 struct adapter *sc = qp->nca->sc; 555 struct toepcb *toep = qp->toep; 556 struct socket *so = qp->so; 557 struct inpcb *inp = sotoinpcb(so); 558 struct tcpcb *tp = intotcpcb(inp); 559 struct mbufq mq; 560 int error; 561 562 mbufq_init(&mq, INT_MAX); 563 if (!use_dsgl || len < inline_threshold || data == NULL) 564 error = che_write_mem_inline(sc, toep, addr, len, data, &mq); 565 else 566 error = che_write_mem_dma_aligned(sc, toep, addr, len, data, 567 &mq); 568 if (__predict_false(error != 0)) 569 goto error; 570 571 INP_WLOCK(inp); 572 if ((tp->t_flags & TF_DISCONNECTED) != 0) { 573 INP_WUNLOCK(inp); 574 error = ECONNRESET; 575 goto error; 576 } 577 mbufq_concat(&toep->ulp_pduq, &mq); 578 INP_WUNLOCK(inp); 579 return (0); 580 581 error: 582 mbufq_drain(&mq); 583 return (error); 584 } 585 586 static bool 587 che_alloc_pbl(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb) 588 { 589 struct adapter *sc = qp->nca->sc; 590 struct memdesc *mem = &cb->io.io_mem; 591 uint64_t *pbl; 592 uint32_t addr, len; 593 u_int i, npages; 594 int error; 595 596 MPASS(cb->pbl == NULL); 597 MPASS(cb->ddp_ok); 598 599 /* Hardware limit? iWARP only enforces this for T5. */ 600 if (cb->io.io_len >= (8 * 1024 * 1024 * 1024ULL)) 601 return (false); 602 603 npages = che_npages(cb); 604 len = roundup2(npages, 4) * sizeof(*cb->pbl); 605 addr = t4_pblpool_alloc(sc, len); 606 if (addr == 0) 607 return (false); 608 609 pbl = malloc(len, M_NVMF_CHE, M_NOWAIT | M_ZERO); 610 if (pbl == NULL) { 611 t4_pblpool_free(sc, addr, len); 612 return (false); 613 } 614 615 switch (mem->md_type) { 616 case MEMDESC_VADDR: 617 { 618 vm_offset_t va; 619 620 va = trunc_page((uintptr_t)mem->u.md_vaddr); 621 for (i = 0; i < npages; i++) 622 pbl[i] = htobe64(pmap_kextract(va + i * PAGE_SIZE)); 623 break; 624 } 625 case MEMDESC_PADDR: 626 { 627 vm_paddr_t pa; 628 629 pa = trunc_page(mem->u.md_paddr); 630 for (i = 0; i < npages; i++) 631 pbl[i] = htobe64(pa + i * PAGE_SIZE); 632 break; 633 } 634 case MEMDESC_VMPAGES: 635 for (i = 0; i < npages; i++) 636 pbl[i] = htobe64(VM_PAGE_TO_PHYS(mem->u.md_ma[i])); 637 break; 638 case MEMDESC_VLIST: 639 { 640 struct bus_dma_segment *ds; 641 vm_offset_t va; 642 vm_size_t len; 643 u_int j, k; 644 645 i = 0; 646 ds = mem->u.md_list; 647 for (j = 0; j < mem->md_nseg; j++, ds++) { 648 va = trunc_page((uintptr_t)ds->ds_addr); 649 len = ds->ds_len; 650 if (ds->ds_addr % PAGE_SIZE != 0) 651 len += ds->ds_addr % PAGE_SIZE; 652 for (k = 0; k < howmany(len, PAGE_SIZE); k++) { 653 pbl[i] = htobe64(pmap_kextract(va + 654 k * PAGE_SIZE)); 655 i++; 656 } 657 } 658 MPASS(i == npages); 659 break; 660 } 661 case MEMDESC_PLIST: 662 { 663 struct bus_dma_segment *ds; 664 vm_paddr_t pa; 665 vm_size_t len; 666 u_int j, k; 667 668 i = 0; 669 ds = mem->u.md_list; 670 for (j = 0; j < mem->md_nseg; j++, ds++) { 671 pa = trunc_page((vm_paddr_t)ds->ds_addr); 672 len = ds->ds_len; 673 if (ds->ds_addr % PAGE_SIZE != 0) 674 len += ds->ds_addr % PAGE_SIZE; 675 for (k = 0; k < howmany(len, PAGE_SIZE); k++) { 676 pbl[i] = htobe64(pa + k * PAGE_SIZE); 677 i++; 678 } 679 } 680 MPASS(i == npages); 681 break; 682 } 683 default: 684 __assert_unreachable(); 685 } 686 687 error = che_write_adapter_mem(qp, addr >> 5, len, pbl); 688 if (error != 0) { 689 t4_pblpool_free(sc, addr, len); 690 free(pbl, M_NVMF_CHE); 691 return (false); 692 } 693 694 cb->pbl = pbl; 695 cb->pbl_addr = addr; 696 cb->pbl_len = len; 697 698 return (true); 699 } 700 701 static void 702 che_free_pbl(struct nvmf_che_command_buffer *cb) 703 { 704 free(cb->pbl, M_NVMF_CHE); 705 t4_pblpool_free(cb->qp->nca->sc, cb->pbl_addr, cb->pbl_len); 706 cb->pbl = NULL; 707 cb->pbl_addr = 0; 708 cb->pbl_len = 0; 709 } 710 711 static bool 712 che_write_tpt_entry(struct nvmf_che_qpair *qp, 713 struct nvmf_che_command_buffer *cb, uint16_t stag) 714 { 715 uint32_t tpt_addr; 716 int error; 717 718 cb->tpte.valid_to_pdid = htobe32(F_FW_RI_TPTE_VALID | 719 V_FW_RI_TPTE_STAGKEY(CHE_STAG_COLOR(stag)) | 720 F_FW_RI_TPTE_STAGSTATE | 721 V_FW_RI_TPTE_STAGTYPE(FW_RI_STAG_NSMR) | 722 V_FW_RI_TPTE_PDID(0)); 723 cb->tpte.locread_to_qpid = htobe32( 724 V_FW_RI_TPTE_PERM(FW_RI_MEM_ACCESS_REM_WRITE) | 725 V_FW_RI_TPTE_ADDRTYPE(FW_RI_ZERO_BASED_TO) | 726 V_FW_RI_TPTE_PS(PAGE_SIZE) | 727 V_FW_RI_TPTE_QPID(qp->toep->tid)); 728 #define PBL_OFF(qp, a) ((a) - (qp)->nca->sc->vres.pbl.start) 729 cb->tpte.nosnoop_pbladdr = 730 htobe32(V_FW_RI_TPTE_PBLADDR(PBL_OFF(qp, cb->pbl_addr) >> 3)); 731 cb->tpte.len_lo = htobe32(cb->data_len); 732 cb->tpte.va_hi = 0; 733 cb->tpte.va_lo_fbo = htobe32(che_fbo(cb)); 734 cb->tpte.dca_mwbcnt_pstag = 0; 735 cb->tpte.len_hi = htobe32(cb->data_offset); 736 737 tpt_addr = qp->tpt_offset + CHE_STAG_IDX(stag) + 738 (qp->nca->sc->vres.stag.start >> 5); 739 740 error = che_write_adapter_mem(qp, tpt_addr, sizeof(cb->tpte), 741 &cb->tpte); 742 return (error == 0); 743 } 744 745 static void 746 che_clear_tpt_entry(struct nvmf_che_qpair *qp, uint16_t stag) 747 { 748 uint32_t tpt_addr; 749 750 tpt_addr = qp->tpt_offset + CHE_STAG_IDX(stag) + 751 (qp->nca->sc->vres.stag.start >> 5); 752 753 (void)che_write_adapter_mem(qp, tpt_addr, sizeof(struct fw_ri_tpte), 754 NULL); 755 } 756 757 static uint16_t 758 che_alloc_ddp_stag(struct nvmf_che_qpair *qp, 759 struct nvmf_che_command_buffer *cb) 760 { 761 uint16_t stag_idx; 762 763 mtx_assert(&qp->rx_buffers.lock, MA_OWNED); 764 MPASS(cb->ddp_ok); 765 766 if (qp->active_ddp_tags == qp->num_ddp_tags) 767 return (CHE_DDP_NO_TAG); 768 769 MPASS(qp->num_ddp_tags != 0); 770 771 stag_idx = qp->next_ddp_tag; 772 for (;;) { 773 if (qp->open_ddp_tags[stag_idx] == NULL) 774 break; 775 if (stag_idx == qp->num_ddp_tags - 1) { 776 stag_idx = 0; 777 if (qp->ddp_color == CHE_DDP_MAX_COLOR) 778 qp->ddp_color = 0; 779 else 780 qp->ddp_color++; 781 } else 782 stag_idx++; 783 MPASS(stag_idx != qp->next_ddp_tag); 784 } 785 if (stag_idx == qp->num_ddp_tags - 1) 786 qp->next_ddp_tag = 0; 787 else 788 qp->next_ddp_tag = stag_idx + 1; 789 790 qp->active_ddp_tags++; 791 qp->open_ddp_tags[stag_idx] = cb; 792 793 return (CHE_DDP_TAG(stag_idx, qp->ddp_color)); 794 } 795 796 static void 797 che_free_ddp_stag(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb, 798 uint16_t stag) 799 { 800 MPASS(!CHE_TAG_IS_FL(stag)); 801 802 mtx_assert(&qp->rx_buffers.lock, MA_OWNED); 803 804 MPASS(qp->open_ddp_tags[CHE_STAG_IDX(stag)] == cb); 805 806 qp->open_ddp_tags[CHE_STAG_IDX(stag)] = NULL; 807 qp->active_ddp_tags--; 808 } 809 810 static uint16_t 811 che_alloc_ddp_tag(struct nvmf_che_qpair *qp, 812 struct nvmf_che_command_buffer *cb) 813 { 814 uint16_t stag; 815 816 mtx_assert(&qp->rx_buffers.lock, MA_OWNED); 817 818 if (!cb->ddp_ok) 819 return (CHE_DDP_NO_TAG); 820 821 stag = che_alloc_ddp_stag(qp, cb); 822 if (stag == CHE_DDP_NO_TAG) { 823 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_setup_no_stag, 824 1); 825 return (CHE_DDP_NO_TAG); 826 } 827 828 if (!che_alloc_pbl(qp, cb)) { 829 che_free_ddp_stag(qp, cb, stag); 830 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_setup_error, 1); 831 return (CHE_DDP_NO_TAG); 832 } 833 834 if (!che_write_tpt_entry(qp, cb, stag)) { 835 che_free_pbl(cb); 836 che_free_ddp_stag(qp, cb, stag); 837 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_setup_error, 1); 838 return (CHE_DDP_NO_TAG); 839 } 840 841 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_setup_ok, 1); 842 return (stag); 843 } 844 845 static void 846 che_free_ddp_tag(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb, 847 uint16_t stag) 848 { 849 MPASS(!CHE_TAG_IS_FL(stag)); 850 851 mtx_assert(&qp->rx_buffers.lock, MA_OWNED); 852 853 MPASS(qp->open_ddp_tags[CHE_STAG_IDX(stag)] == cb); 854 855 che_clear_tpt_entry(qp, stag); 856 che_free_pbl(cb); 857 che_free_ddp_stag(qp, cb, stag); 858 } 859 860 static void 861 nvmf_che_write_pdu(struct nvmf_che_qpair *qp, struct mbuf *m) 862 { 863 struct epoch_tracker et; 864 struct socket *so = qp->so; 865 struct inpcb *inp = sotoinpcb(so); 866 struct tcpcb *tp = intotcpcb(inp); 867 struct toepcb *toep = qp->toep; 868 869 CURVNET_SET(so->so_vnet); 870 NET_EPOCH_ENTER(et); 871 INP_WLOCK(inp); 872 if (__predict_false(tp->t_flags & TF_DISCONNECTED) || 873 __predict_false((toep->flags & TPF_ATTACHED) == 0)) { 874 m_freem(m); 875 } else { 876 mbufq_enqueue(&toep->ulp_pduq, m); 877 t4_push_pdus(toep->vi->adapter, toep, 0); 878 } 879 INP_WUNLOCK(inp); 880 NET_EPOCH_EXIT(et); 881 CURVNET_RESTORE(); 882 } 883 884 static void 885 nvmf_che_report_error(struct nvmf_che_qpair *qp, uint16_t fes, uint32_t fei, 886 struct mbuf *rx_pdu, u_int hlen) 887 { 888 struct nvme_tcp_term_req_hdr *hdr; 889 struct mbuf *m; 890 891 if (hlen != 0) { 892 hlen = min(hlen, NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE); 893 hlen = min(hlen, m_length(rx_pdu, NULL)); 894 } 895 896 m = m_get2(sizeof(*hdr) + hlen, M_WAITOK, MT_DATA, M_PKTHDR); 897 m->m_len = sizeof(*hdr) + hlen; 898 m->m_pkthdr.len = m->m_len; 899 hdr = mtod(m, void *); 900 memset(hdr, 0, sizeof(*hdr)); 901 hdr->common.pdu_type = qp->qp.nq_controller ? 902 NVME_TCP_PDU_TYPE_C2H_TERM_REQ : NVME_TCP_PDU_TYPE_H2C_TERM_REQ; 903 hdr->common.hlen = sizeof(*hdr); 904 hdr->common.plen = sizeof(*hdr) + hlen; 905 hdr->fes = htole16(fes); 906 le32enc(hdr->fei, fei); 907 if (hlen != 0) 908 m_copydata(rx_pdu, 0, hlen, (caddr_t)(hdr + 1)); 909 910 nvmf_che_write_pdu(qp, m); 911 } 912 913 static int 914 nvmf_che_validate_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu) 915 { 916 const struct nvme_tcp_common_pdu_hdr *ch; 917 struct mbuf *m = pdu->m; 918 uint32_t data_len, fei, plen, rx_digest; 919 u_int hlen, cpl_error; 920 int error; 921 uint16_t fes; 922 923 /* Determine how large of a PDU header to return for errors. */ 924 ch = pdu->hdr; 925 hlen = ch->hlen; 926 plen = le32toh(ch->plen); 927 if (hlen < sizeof(*ch) || hlen > plen) 928 hlen = sizeof(*ch); 929 930 cpl_error = m->m_pkthdr.nvmf_cpl_status & CMP_STATUS_ERROR_MASK; 931 switch (cpl_error) { 932 case CMP_STATUS_NO_ERROR: 933 break; 934 case CMP_STATUS_HEADER_DIGEST: 935 counter_u64_add( 936 qp->toep->ofld_rxq->rx_nvme_header_digest_errors, 1); 937 printf("NVMe/TCP: Header digest mismatch\n"); 938 rx_digest = le32dec(mtodo(m, ch->hlen)); 939 nvmf_che_report_error(qp, 940 NVME_TCP_TERM_REQ_FES_HDGST_ERROR, rx_digest, m, 941 hlen); 942 return (EBADMSG); 943 case CMP_STATUS_DIRECTION_MISMATCH: 944 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1); 945 printf("NVMe/TCP: Invalid PDU type %u\n", ch->pdu_type); 946 nvmf_che_report_error(qp, 947 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 948 offsetof(struct nvme_tcp_common_pdu_hdr, pdu_type), m, 949 hlen); 950 return (EBADMSG); 951 case CMP_STATUS_SUCCESS_NOT_LAST: 952 case CMP_STATUS_DIGEST_FLAG_MISMATCH: 953 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1); 954 printf("NVMe/TCP: Invalid PDU header flags %#x\n", ch->flags); 955 nvmf_che_report_error(qp, 956 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 957 offsetof(struct nvme_tcp_common_pdu_hdr, flags), m, hlen); 958 return (EBADMSG); 959 case CMP_STATUS_BAD_DATA_LENGTH: 960 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1); 961 printf("NVMe/TCP: Invalid PDU length %u\n", plen); 962 nvmf_che_report_error(qp, 963 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 964 offsetof(struct nvme_tcp_common_pdu_hdr, plen), m, hlen); 965 return (EBADMSG); 966 case CMP_STATUS_USER_MODE_UNALLOCATED: 967 case CMP_STATUS_RQT_LIMIT: 968 case CMP_STATUS_RQT_WRAP: 969 case CMP_STATUS_RQT_BOUND: 970 device_printf(qp->nca->sc->dev, 971 "received invalid NVMET error %u\n", 972 cpl_error); 973 return (ECONNRESET); 974 case CMP_STATUS_TPT_LIMIT: 975 case CMP_STATUS_TPT_INVALID: 976 case CMP_STATUS_TPT_COLOUR_MISMATCH: 977 case CMP_STATUS_TPT_MISC: 978 case CMP_STATUS_TPT_WRAP: 979 case CMP_STATUS_TPT_BOUND: 980 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1); 981 switch (ch->pdu_type) { 982 case NVME_TCP_PDU_TYPE_H2C_DATA: 983 nvmf_che_report_error(qp, 984 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 985 offsetof(struct nvme_tcp_h2c_data_hdr, ttag), 986 pdu->m, pdu->hdr->hlen); 987 return (EBADMSG); 988 case NVME_TCP_PDU_TYPE_C2H_DATA: 989 nvmf_che_report_error(qp, 990 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 991 offsetof(struct nvme_tcp_c2h_data_hdr, cccid), m, 992 hlen); 993 return (EBADMSG); 994 default: 995 device_printf(qp->nca->sc->dev, 996 "received DDP NVMET error %u for PDU %u\n", 997 cpl_error, ch->pdu_type); 998 return (ECONNRESET); 999 } 1000 case CMP_STATUS_TPT_LAST_PDU_UNALIGNED: 1001 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1); 1002 nvmf_che_report_error(qp, 1003 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, m, hlen); 1004 return (EBADMSG); 1005 case CMP_STATUS_PBL_LIMIT: 1006 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1); 1007 nvmf_che_report_error(qp, 1008 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, m, 1009 hlen); 1010 return (EBADMSG); 1011 case CMP_STATUS_DATA_DIGEST: 1012 /* Handled below. */ 1013 break; 1014 default: 1015 device_printf(qp->nca->sc->dev, 1016 "received unknown NVMET error %u\n", 1017 cpl_error); 1018 return (ECONNRESET); 1019 } 1020 1021 error = nvmf_tcp_validate_pdu_header(ch, qp->qp.nq_controller, 1022 qp->header_digests, qp->data_digests, qp->rxpda, &data_len, &fes, 1023 &fei); 1024 if (error != 0) { 1025 if (error != ECONNRESET) 1026 nvmf_che_report_error(qp, fes, fei, m, hlen); 1027 return (error); 1028 } 1029 1030 /* Check data digest if present. */ 1031 pdu->data_digest_mismatch = false; 1032 if ((ch->flags & NVME_TCP_CH_FLAGS_DDGSTF) != 0) { 1033 if (cpl_error == CMP_STATUS_DATA_DIGEST) { 1034 printf("NVMe/TCP: Data digest mismatch\n"); 1035 pdu->data_digest_mismatch = true; 1036 counter_u64_add( 1037 qp->toep->ofld_rxq->rx_nvme_data_digest_errors, 1); 1038 } 1039 } 1040 1041 pdu->data_len = data_len; 1042 1043 return (0); 1044 } 1045 1046 static void 1047 nvmf_che_free_pdu(struct nvmf_che_rxpdu *pdu) 1048 { 1049 m_freem(pdu->m); 1050 pdu->m = NULL; 1051 pdu->hdr = NULL; 1052 } 1053 1054 static int 1055 nvmf_che_handle_term_req(struct nvmf_che_rxpdu *pdu) 1056 { 1057 const struct nvme_tcp_term_req_hdr *hdr; 1058 1059 hdr = (const void *)pdu->hdr; 1060 1061 printf("NVMe/TCP: Received termination request: fes %#x fei %#x\n", 1062 le16toh(hdr->fes), le32dec(hdr->fei)); 1063 nvmf_che_free_pdu(pdu); 1064 return (ECONNRESET); 1065 } 1066 1067 static int 1068 nvmf_che_save_command_capsule(struct nvmf_che_qpair *qp, 1069 struct nvmf_che_rxpdu *pdu) 1070 { 1071 const struct nvme_tcp_cmd *cmd; 1072 struct nvmf_capsule *nc; 1073 struct nvmf_che_capsule *cc; 1074 1075 cmd = (const void *)pdu->hdr; 1076 1077 nc = nvmf_allocate_command(&qp->qp, &cmd->ccsqe, M_WAITOK); 1078 1079 cc = CCAP(nc); 1080 cc->rx_pdu = *pdu; 1081 1082 nvmf_capsule_received(&qp->qp, nc); 1083 return (0); 1084 } 1085 1086 static int 1087 nvmf_che_save_response_capsule(struct nvmf_che_qpair *qp, 1088 struct nvmf_che_rxpdu *pdu) 1089 { 1090 const struct nvme_tcp_rsp *rsp; 1091 struct nvme_completion cpl; 1092 struct nvmf_capsule *nc; 1093 struct nvmf_che_capsule *cc; 1094 uint16_t cid; 1095 1096 rsp = (const void *)pdu->hdr; 1097 1098 /* 1099 * Restore the original CID and ensure any command buffers 1100 * associated with this CID have been released. Once the CQE 1101 * has been received, no further transfers to the command 1102 * buffer for the associated CID can occur. 1103 */ 1104 cpl = rsp->rccqe; 1105 cid = le16toh(cpl.cid); 1106 if (CHE_TAG_IS_FL(cid)) { 1107 cid = CHE_RAW_FL_TAG(cid); 1108 mtx_lock(&qp->fl_cid_lock); 1109 MPASS(FL_CID_ISACTIVE(cid, qp->fl_cid_set)); 1110 cpl.cid = qp->fl_cids[cid]; 1111 FL_CID_FREE(cid, qp->fl_cid_set); 1112 mtx_unlock(&qp->fl_cid_lock); 1113 1114 che_purge_command_buffer(&qp->rx_buffers, rsp->rccqe.cid); 1115 che_purge_command_buffer(&qp->tx_buffers, rsp->rccqe.cid); 1116 } else { 1117 struct nvmf_che_command_buffer *cb; 1118 1119 mtx_lock(&qp->rx_buffers.lock); 1120 cb = qp->open_ddp_tags[CHE_STAG_IDX(cid)]; 1121 MPASS(cb != NULL); 1122 MPASS(cb->cid == rsp->rccqe.cid); 1123 cpl.cid = cb->original_cid; 1124 che_free_ddp_tag(qp, cb, cid); 1125 mtx_unlock(&qp->rx_buffers.lock); 1126 che_release_command_buffer(cb); 1127 } 1128 #ifdef VERBOSE_TRACES 1129 CTR(KTR_CXGBE, "%s: tid %u freed cid 0x%04x for 0x%04x", __func__, 1130 qp->toep->tid, le16toh(rsp->rccqe.cid), cpl.cid); 1131 #endif 1132 1133 nc = nvmf_allocate_response(&qp->qp, &cpl, M_WAITOK); 1134 1135 nc->nc_sqhd_valid = true; 1136 cc = CCAP(nc); 1137 cc->rx_pdu = *pdu; 1138 1139 nvmf_capsule_received(&qp->qp, nc); 1140 return (0); 1141 } 1142 1143 /* 1144 * Construct a PDU that contains an optional data payload. This 1145 * includes dealing with the length fields in the common header. The 1146 * adapter inserts digests and padding when the PDU is transmitted. 1147 */ 1148 static struct mbuf * 1149 nvmf_che_construct_pdu(struct nvmf_che_qpair *qp, void *hdr, size_t hlen, 1150 struct mbuf *data, uint32_t data_len) 1151 { 1152 struct nvme_tcp_common_pdu_hdr *ch; 1153 struct mbuf *top; 1154 uint32_t pdo, plen; 1155 uint8_t ulp_submode; 1156 1157 plen = hlen; 1158 if (qp->header_digests) 1159 plen += sizeof(uint32_t); 1160 if (data_len != 0) { 1161 KASSERT(m_length(data, NULL) == data_len, ("length mismatch")); 1162 pdo = roundup(plen, qp->txpda); 1163 plen = pdo + data_len; 1164 if (qp->data_digests) 1165 plen += sizeof(uint32_t); 1166 } else { 1167 KASSERT(data == NULL, ("payload mbuf with zero length")); 1168 pdo = 0; 1169 } 1170 1171 top = m_get2(hlen, M_WAITOK, MT_DATA, M_PKTHDR); 1172 top->m_len = hlen; 1173 top->m_pkthdr.len = hlen; 1174 ch = mtod(top, void *); 1175 memcpy(ch, hdr, hlen); 1176 ch->hlen = hlen; 1177 ulp_submode = 0; 1178 if (qp->header_digests) { 1179 ch->flags |= NVME_TCP_CH_FLAGS_HDGSTF; 1180 ulp_submode |= ULP_CRC_HEADER; 1181 } 1182 if (qp->data_digests && data_len != 0) { 1183 ch->flags |= NVME_TCP_CH_FLAGS_DDGSTF; 1184 ulp_submode |= ULP_CRC_DATA; 1185 } 1186 ch->pdo = pdo; 1187 ch->plen = htole32(plen); 1188 set_mbuf_ulp_submode(top, ulp_submode); 1189 1190 if (data_len != 0) { 1191 top->m_pkthdr.len += data_len; 1192 top->m_next = data; 1193 } 1194 1195 return (top); 1196 } 1197 1198 /* Allocate the next free freelist transfer tag. */ 1199 static bool 1200 nvmf_che_allocate_fl_ttag(struct nvmf_che_qpair *qp, 1201 struct nvmf_che_command_buffer *cb) 1202 { 1203 uint16_t ttag; 1204 1205 mtx_assert(&qp->rx_buffers.lock, MA_OWNED); 1206 1207 if (qp->active_fl_ttags == qp->num_fl_ttags) 1208 return (false); 1209 1210 ttag = qp->next_fl_ttag; 1211 for (;;) { 1212 if (qp->open_fl_ttags[ttag] == NULL) 1213 break; 1214 if (ttag == qp->num_fl_ttags - 1) 1215 ttag = 0; 1216 else 1217 ttag++; 1218 MPASS(ttag != qp->next_fl_ttag); 1219 } 1220 if (ttag == qp->num_fl_ttags - 1) 1221 qp->next_fl_ttag = 0; 1222 else 1223 qp->next_fl_ttag = ttag + 1; 1224 1225 qp->active_fl_ttags++; 1226 qp->open_fl_ttags[ttag] = cb; 1227 1228 cb->ttag = ttag | CHE_FL_TAG_MASK; 1229 return (true); 1230 } 1231 1232 /* Attempt to allocate a free transfer tag and assign it to cb. */ 1233 static bool 1234 nvmf_che_allocate_ttag(struct nvmf_che_qpair *qp, 1235 struct nvmf_che_command_buffer *cb) 1236 { 1237 uint16_t stag; 1238 1239 mtx_assert(&qp->rx_buffers.lock, MA_OWNED); 1240 1241 stag = che_alloc_ddp_tag(qp, cb); 1242 if (stag == CHE_DDP_NO_TAG) { 1243 if (!nvmf_che_allocate_fl_ttag(qp, cb)) 1244 return (false); 1245 } else { 1246 cb->ttag = stag; 1247 } 1248 #ifdef VERBOSE_TRACES 1249 CTR(KTR_CXGBE, "%s: tid %u allocated ttag 0x%04x", __func__, 1250 qp->toep->tid, cb->ttag); 1251 #endif 1252 cb->cc->active_r2ts++; 1253 return (true); 1254 } 1255 1256 /* Find the next command buffer eligible to schedule for R2T. */ 1257 static struct nvmf_che_command_buffer * 1258 nvmf_che_next_r2t(struct nvmf_che_qpair *qp) 1259 { 1260 struct nvmf_che_command_buffer *cb; 1261 1262 mtx_assert(&qp->rx_buffers.lock, MA_OWNED); 1263 1264 TAILQ_FOREACH(cb, &qp->rx_buffers.head, link) { 1265 /* NB: maxr2t is 0's based. */ 1266 if (cb->cc->active_r2ts > qp->maxr2t) 1267 continue; 1268 1269 if (!nvmf_che_allocate_ttag(qp, cb)) 1270 return (NULL); 1271 #ifdef INVARIANTS 1272 cb->cc->pending_r2ts--; 1273 #endif 1274 TAILQ_REMOVE(&qp->rx_buffers.head, cb, link); 1275 return (cb); 1276 } 1277 return (NULL); 1278 } 1279 1280 /* NB: cid and is little-endian already. */ 1281 static void 1282 che_send_r2t(struct nvmf_che_qpair *qp, uint16_t cid, uint16_t ttag, 1283 uint32_t data_offset, uint32_t data_len) 1284 { 1285 struct nvme_tcp_r2t_hdr r2t; 1286 struct mbuf *m; 1287 1288 memset(&r2t, 0, sizeof(r2t)); 1289 r2t.common.pdu_type = NVME_TCP_PDU_TYPE_R2T; 1290 r2t.cccid = cid; 1291 r2t.ttag = htole16(ttag); 1292 r2t.r2to = htole32(data_offset); 1293 r2t.r2tl = htole32(data_len); 1294 1295 m = nvmf_che_construct_pdu(qp, &r2t, sizeof(r2t), NULL, 0); 1296 nvmf_che_write_pdu(qp, m); 1297 } 1298 1299 /* 1300 * Release a transfer tag and schedule another R2T. 1301 * 1302 * NB: This drops the rx_buffers.lock mutex. 1303 */ 1304 static void 1305 nvmf_che_send_next_r2t(struct nvmf_che_qpair *qp, 1306 struct nvmf_che_command_buffer *cb) 1307 { 1308 struct nvmf_che_command_buffer *ncb; 1309 1310 mtx_assert(&qp->rx_buffers.lock, MA_OWNED); 1311 1312 #ifdef VERBOSE_TRACES 1313 CTR(KTR_CXGBE, "%s: tid %u freed ttag 0x%04x", __func__, qp->toep->tid, 1314 cb->ttag); 1315 #endif 1316 if (CHE_TAG_IS_FL(cb->ttag)) { 1317 uint16_t ttag; 1318 1319 ttag = CHE_RAW_FL_TAG(cb->ttag); 1320 MPASS(qp->open_fl_ttags[ttag] == cb); 1321 1322 /* Release this transfer tag. */ 1323 qp->open_fl_ttags[ttag] = NULL; 1324 qp->active_fl_ttags--; 1325 } else 1326 che_free_ddp_tag(qp, cb, cb->ttag); 1327 1328 cb->cc->active_r2ts--; 1329 1330 /* Schedule another R2T. */ 1331 ncb = nvmf_che_next_r2t(qp); 1332 mtx_unlock(&qp->rx_buffers.lock); 1333 if (ncb != NULL) 1334 che_send_r2t(qp, ncb->cid, ncb->ttag, ncb->data_offset, 1335 ncb->data_len); 1336 } 1337 1338 /* 1339 * Copy len bytes starting at offset skip from an mbuf chain into an 1340 * I/O buffer at destination offset io_offset. 1341 */ 1342 static void 1343 mbuf_copyto_io(struct mbuf *m, u_int skip, u_int len, 1344 struct nvmf_io_request *io, u_int io_offset) 1345 { 1346 u_int todo; 1347 1348 while (m->m_len <= skip) { 1349 skip -= m->m_len; 1350 m = m->m_next; 1351 } 1352 while (len != 0) { 1353 MPASS((m->m_flags & M_EXTPG) == 0); 1354 1355 todo = min(m->m_len - skip, len); 1356 memdesc_copyback(&io->io_mem, io_offset, todo, mtodo(m, skip)); 1357 skip = 0; 1358 io_offset += todo; 1359 len -= todo; 1360 m = m->m_next; 1361 } 1362 } 1363 1364 static int 1365 nvmf_che_handle_h2c_data(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu) 1366 { 1367 const struct nvme_tcp_h2c_data_hdr *h2c; 1368 struct nvmf_che_command_buffer *cb; 1369 uint32_t data_len, data_offset; 1370 uint16_t ttag, fl_ttag; 1371 1372 h2c = (const void *)pdu->hdr; 1373 if (le32toh(h2c->datal) > qp->maxh2cdata) { 1374 nvmf_che_report_error(qp, 1375 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_LIMIT_EXCEEDED, 0, 1376 pdu->m, pdu->hdr->hlen); 1377 nvmf_che_free_pdu(pdu); 1378 return (EBADMSG); 1379 } 1380 1381 ttag = le16toh(h2c->ttag); 1382 if (CHE_TAG_IS_FL(ttag)) { 1383 fl_ttag = CHE_RAW_FL_TAG(ttag); 1384 if (fl_ttag >= qp->num_fl_ttags) { 1385 nvmf_che_report_error(qp, 1386 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 1387 offsetof(struct nvme_tcp_h2c_data_hdr, ttag), 1388 pdu->m, pdu->hdr->hlen); 1389 nvmf_che_free_pdu(pdu); 1390 return (EBADMSG); 1391 } 1392 1393 mtx_lock(&qp->rx_buffers.lock); 1394 cb = qp->open_fl_ttags[fl_ttag]; 1395 } else { 1396 if (CHE_STAG_IDX(ttag) >= qp->num_ddp_tags) { 1397 nvmf_che_report_error(qp, 1398 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 1399 offsetof(struct nvme_tcp_h2c_data_hdr, ttag), 1400 pdu->m, pdu->hdr->hlen); 1401 nvmf_che_free_pdu(pdu); 1402 return (EBADMSG); 1403 } 1404 1405 mtx_lock(&qp->rx_buffers.lock); 1406 cb = qp->open_ddp_tags[CHE_STAG_IDX(ttag)]; 1407 } 1408 1409 if (cb == NULL) { 1410 mtx_unlock(&qp->rx_buffers.lock); 1411 nvmf_che_report_error(qp, 1412 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 1413 offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->m, 1414 pdu->hdr->hlen); 1415 nvmf_che_free_pdu(pdu); 1416 return (EBADMSG); 1417 } 1418 MPASS(cb->ttag == ttag); 1419 1420 /* For a data digest mismatch, fail the I/O request. */ 1421 if (pdu->data_digest_mismatch) { 1422 nvmf_che_send_next_r2t(qp, cb); 1423 cb->error = EINTEGRITY; 1424 che_release_command_buffer(cb); 1425 nvmf_che_free_pdu(pdu); 1426 return (0); 1427 } 1428 1429 data_len = le32toh(h2c->datal); 1430 if (data_len != pdu->data_len) { 1431 mtx_unlock(&qp->rx_buffers.lock); 1432 nvmf_che_report_error(qp, 1433 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 1434 offsetof(struct nvme_tcp_h2c_data_hdr, datal), pdu->m, 1435 pdu->hdr->hlen); 1436 nvmf_che_free_pdu(pdu); 1437 return (EBADMSG); 1438 } 1439 1440 data_offset = le32toh(h2c->datao); 1441 if (data_offset < cb->data_offset || 1442 data_offset + data_len > cb->data_offset + cb->data_len) { 1443 mtx_unlock(&qp->rx_buffers.lock); 1444 nvmf_che_report_error(qp, 1445 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, pdu->m, 1446 pdu->hdr->hlen); 1447 nvmf_che_free_pdu(pdu); 1448 return (EBADMSG); 1449 } 1450 1451 if (data_offset != cb->data_offset + cb->data_xfered) { 1452 if (CHE_TAG_IS_FL(ttag)) { 1453 mtx_unlock(&qp->rx_buffers.lock); 1454 nvmf_che_report_error(qp, 1455 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m, 1456 pdu->hdr->hlen); 1457 nvmf_che_free_pdu(pdu); 1458 return (EBADMSG); 1459 } else { 1460 uint32_t ddp_bytes; 1461 1462 /* Account for PDUs silently received via DDP. */ 1463 ddp_bytes = data_offset - 1464 (cb->data_offset + cb->data_xfered); 1465 cb->data_xfered += ddp_bytes; 1466 #ifdef VERBOSE_TRACES 1467 CTR(KTR_CXGBE, "%s: tid %u previous ddp_bytes %u", 1468 __func__, qp->toep->tid, ddp_bytes); 1469 #endif 1470 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_octets, 1471 ddp_bytes); 1472 } 1473 } 1474 1475 if ((cb->data_xfered + data_len == cb->data_len) != 1476 ((pdu->hdr->flags & NVME_TCP_H2C_DATA_FLAGS_LAST_PDU) != 0)) { 1477 mtx_unlock(&qp->rx_buffers.lock); 1478 nvmf_che_report_error(qp, 1479 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m, 1480 pdu->hdr->hlen); 1481 nvmf_che_free_pdu(pdu); 1482 return (EBADMSG); 1483 } 1484 1485 cb->data_xfered += data_len; 1486 data_offset -= cb->data_offset; 1487 if (cb->data_xfered == cb->data_len) { 1488 nvmf_che_send_next_r2t(qp, cb); 1489 } else { 1490 che_hold_command_buffer(cb); 1491 mtx_unlock(&qp->rx_buffers.lock); 1492 } 1493 1494 if (CHE_TAG_IS_FL(ttag)) 1495 mbuf_copyto_io(pdu->m->m_next, 0, data_len, &cb->io, 1496 data_offset); 1497 1498 che_release_command_buffer(cb); 1499 nvmf_che_free_pdu(pdu); 1500 return (0); 1501 } 1502 1503 static int 1504 nvmf_che_handle_c2h_data(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu) 1505 { 1506 const struct nvme_tcp_c2h_data_hdr *c2h; 1507 struct nvmf_che_command_buffer *cb; 1508 uint32_t data_len, data_offset; 1509 uint16_t cid, original_cid; 1510 1511 /* 1512 * Unlike freelist command buffers, DDP command buffers are 1513 * not released until the response capsule is received to keep 1514 * the STAG allocated until the command has completed. 1515 */ 1516 c2h = (const void *)pdu->hdr; 1517 1518 cid = le16toh(c2h->cccid); 1519 if (CHE_TAG_IS_FL(cid)) { 1520 mtx_lock(&qp->rx_buffers.lock); 1521 cb = che_find_command_buffer(&qp->rx_buffers, c2h->cccid); 1522 } else { 1523 if (CHE_STAG_IDX(cid) >= qp->num_ddp_tags) { 1524 nvmf_che_report_error(qp, 1525 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 1526 offsetof(struct nvme_tcp_c2h_data_hdr, cccid), 1527 pdu->m, pdu->hdr->hlen); 1528 nvmf_che_free_pdu(pdu); 1529 return (EBADMSG); 1530 } 1531 1532 mtx_lock(&qp->rx_buffers.lock); 1533 cb = qp->open_ddp_tags[CHE_STAG_IDX(cid)]; 1534 } 1535 1536 if (cb == NULL) { 1537 mtx_unlock(&qp->rx_buffers.lock); 1538 /* 1539 * XXX: Could be PDU sequence error if cccid is for a 1540 * command that doesn't use a command buffer. 1541 */ 1542 nvmf_che_report_error(qp, 1543 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 1544 offsetof(struct nvme_tcp_c2h_data_hdr, cccid), pdu->m, 1545 pdu->hdr->hlen); 1546 nvmf_che_free_pdu(pdu); 1547 return (EBADMSG); 1548 } 1549 1550 /* For a data digest mismatch, fail the I/O request. */ 1551 if (pdu->data_digest_mismatch) { 1552 cb->error = EINTEGRITY; 1553 if (CHE_TAG_IS_FL(cid)) { 1554 che_remove_command_buffer(&qp->rx_buffers, cb); 1555 mtx_unlock(&qp->rx_buffers.lock); 1556 che_release_command_buffer(cb); 1557 } else 1558 mtx_unlock(&qp->rx_buffers.lock); 1559 nvmf_che_free_pdu(pdu); 1560 return (0); 1561 } 1562 1563 data_len = le32toh(c2h->datal); 1564 if (data_len != pdu->data_len) { 1565 mtx_unlock(&qp->rx_buffers.lock); 1566 nvmf_che_report_error(qp, 1567 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 1568 offsetof(struct nvme_tcp_c2h_data_hdr, datal), pdu->m, 1569 pdu->hdr->hlen); 1570 nvmf_che_free_pdu(pdu); 1571 return (EBADMSG); 1572 } 1573 1574 data_offset = le32toh(c2h->datao); 1575 if (data_offset < cb->data_offset || 1576 data_offset + data_len > cb->data_offset + cb->data_len) { 1577 mtx_unlock(&qp->rx_buffers.lock); 1578 nvmf_che_report_error(qp, 1579 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, 1580 pdu->m, pdu->hdr->hlen); 1581 nvmf_che_free_pdu(pdu); 1582 return (EBADMSG); 1583 } 1584 1585 if (data_offset != cb->data_offset + cb->data_xfered) { 1586 if (CHE_TAG_IS_FL(cid)) { 1587 mtx_unlock(&qp->rx_buffers.lock); 1588 nvmf_che_report_error(qp, 1589 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m, 1590 pdu->hdr->hlen); 1591 nvmf_che_free_pdu(pdu); 1592 return (EBADMSG); 1593 } else { 1594 uint32_t ddp_bytes; 1595 1596 /* Account for PDUs silently received via DDP. */ 1597 ddp_bytes = data_offset - 1598 (cb->data_offset + cb->data_xfered); 1599 cb->data_xfered += ddp_bytes; 1600 #ifdef VERBOSE_TRACES 1601 CTR(KTR_CXGBE, "%s: tid %u previous ddp_bytes %u", 1602 __func__, qp->toep->tid, ddp_bytes); 1603 #endif 1604 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_octets, 1605 ddp_bytes); 1606 } 1607 } 1608 1609 if ((cb->data_xfered + data_len == cb->data_len) != 1610 ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_LAST_PDU) != 0)) { 1611 mtx_unlock(&qp->rx_buffers.lock); 1612 nvmf_che_report_error(qp, 1613 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m, 1614 pdu->hdr->hlen); 1615 nvmf_che_free_pdu(pdu); 1616 return (EBADMSG); 1617 } 1618 1619 cb->data_xfered += data_len; 1620 original_cid = cb->original_cid; 1621 1622 if (CHE_TAG_IS_FL(cid)) { 1623 data_offset -= cb->data_offset; 1624 if (cb->data_xfered == cb->data_len) 1625 che_remove_command_buffer(&qp->rx_buffers, cb); 1626 else 1627 che_hold_command_buffer(cb); 1628 mtx_unlock(&qp->rx_buffers.lock); 1629 1630 if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) { 1631 /* 1632 * Free the CID as the command has now been 1633 * completed. 1634 */ 1635 cid = CHE_RAW_FL_TAG(cid); 1636 mtx_lock(&qp->fl_cid_lock); 1637 MPASS(FL_CID_ISACTIVE(cid, qp->fl_cid_set)); 1638 MPASS(original_cid == qp->fl_cids[cid]); 1639 FL_CID_FREE(cid, qp->fl_cid_set); 1640 mtx_unlock(&qp->fl_cid_lock); 1641 } 1642 1643 mbuf_copyto_io(pdu->m->m_next, 0, data_len, &cb->io, 1644 data_offset); 1645 1646 che_release_command_buffer(cb); 1647 } else { 1648 if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) { 1649 /* 1650 * Free the command buffer and STAG as the 1651 * command has now been completed. 1652 */ 1653 che_free_ddp_tag(qp, cb, cid); 1654 mtx_unlock(&qp->rx_buffers.lock); 1655 che_release_command_buffer(cb); 1656 } else 1657 mtx_unlock(&qp->rx_buffers.lock); 1658 } 1659 1660 if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) { 1661 struct nvme_completion cqe; 1662 struct nvmf_capsule *nc; 1663 1664 memset(&cqe, 0, sizeof(cqe)); 1665 cqe.cid = original_cid; 1666 1667 nc = nvmf_allocate_response(&qp->qp, &cqe, M_WAITOK); 1668 nc->nc_sqhd_valid = false; 1669 1670 nvmf_capsule_received(&qp->qp, nc); 1671 } 1672 1673 nvmf_che_free_pdu(pdu); 1674 return (0); 1675 } 1676 1677 /* Called when m_free drops refcount to 0. */ 1678 static void 1679 nvmf_che_mbuf_done(struct mbuf *m) 1680 { 1681 struct nvmf_che_command_buffer *cb = m->m_ext.ext_arg1; 1682 1683 che_free_command_buffer(cb); 1684 } 1685 1686 static struct mbuf * 1687 nvmf_che_mbuf(void *arg, int how, void *data, size_t len) 1688 { 1689 struct nvmf_che_command_buffer *cb = arg; 1690 struct mbuf *m; 1691 1692 m = m_get(how, MT_DATA); 1693 m->m_flags |= M_RDONLY; 1694 m_extaddref(m, data, len, &cb->refs, nvmf_che_mbuf_done, cb, NULL); 1695 m->m_len = len; 1696 return (m); 1697 } 1698 1699 static void 1700 nvmf_che_free_mext_pg(struct mbuf *m) 1701 { 1702 struct nvmf_che_command_buffer *cb = m->m_ext.ext_arg1; 1703 1704 M_ASSERTEXTPG(m); 1705 che_release_command_buffer(cb); 1706 } 1707 1708 static struct mbuf * 1709 nvmf_che_mext_pg(void *arg, int how) 1710 { 1711 struct nvmf_che_command_buffer *cb = arg; 1712 struct mbuf *m; 1713 1714 m = mb_alloc_ext_pgs(how, nvmf_che_free_mext_pg, M_RDONLY); 1715 m->m_ext.ext_arg1 = cb; 1716 che_hold_command_buffer(cb); 1717 return (m); 1718 } 1719 1720 /* 1721 * Return an mbuf chain for a range of data belonging to a command 1722 * buffer. 1723 * 1724 * The mbuf chain uses M_EXT mbufs which hold references on the 1725 * command buffer so that it remains "alive" until the data has been 1726 * fully transmitted. If truncate_ok is true, then the mbuf chain 1727 * might return a short chain to avoid gratuitously splitting up a 1728 * page. 1729 */ 1730 static struct mbuf * 1731 nvmf_che_command_buffer_mbuf(struct nvmf_che_command_buffer *cb, 1732 uint32_t data_offset, uint32_t data_len, uint32_t *actual_len, 1733 bool can_truncate) 1734 { 1735 struct mbuf *m; 1736 size_t len; 1737 1738 m = memdesc_alloc_ext_mbufs(&cb->io.io_mem, nvmf_che_mbuf, 1739 nvmf_che_mext_pg, cb, M_WAITOK, data_offset, data_len, &len, 1740 can_truncate); 1741 if (actual_len != NULL) 1742 *actual_len = len; 1743 return (m); 1744 } 1745 1746 /* NB: cid and ttag and little-endian already. */ 1747 static void 1748 che_send_h2c_pdu(struct nvmf_che_qpair *qp, uint16_t cid, uint16_t ttag, 1749 uint32_t data_offset, struct mbuf *m, size_t len, bool last_pdu) 1750 { 1751 struct nvme_tcp_h2c_data_hdr h2c; 1752 struct mbuf *top; 1753 1754 memset(&h2c, 0, sizeof(h2c)); 1755 h2c.common.pdu_type = NVME_TCP_PDU_TYPE_H2C_DATA; 1756 if (last_pdu) 1757 h2c.common.flags |= NVME_TCP_H2C_DATA_FLAGS_LAST_PDU; 1758 h2c.cccid = cid; 1759 h2c.ttag = ttag; 1760 h2c.datao = htole32(data_offset); 1761 h2c.datal = htole32(len); 1762 1763 top = nvmf_che_construct_pdu(qp, &h2c, sizeof(h2c), m, len); 1764 nvmf_che_write_pdu(qp, top); 1765 } 1766 1767 static int 1768 nvmf_che_handle_r2t(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu) 1769 { 1770 const struct nvme_tcp_r2t_hdr *r2t; 1771 struct nvmf_che_command_buffer *cb; 1772 uint32_t data_len, data_offset; 1773 1774 r2t = (const void *)pdu->hdr; 1775 1776 mtx_lock(&qp->tx_buffers.lock); 1777 cb = che_find_command_buffer(&qp->tx_buffers, r2t->cccid); 1778 if (cb == NULL) { 1779 mtx_unlock(&qp->tx_buffers.lock); 1780 nvmf_che_report_error(qp, 1781 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 1782 offsetof(struct nvme_tcp_r2t_hdr, cccid), pdu->m, 1783 pdu->hdr->hlen); 1784 nvmf_che_free_pdu(pdu); 1785 return (EBADMSG); 1786 } 1787 1788 data_offset = le32toh(r2t->r2to); 1789 if (data_offset != cb->data_xfered) { 1790 mtx_unlock(&qp->tx_buffers.lock); 1791 nvmf_che_report_error(qp, 1792 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m, 1793 pdu->hdr->hlen); 1794 nvmf_che_free_pdu(pdu); 1795 return (EBADMSG); 1796 } 1797 1798 /* 1799 * XXX: The spec does not specify how to handle R2T tranfers 1800 * out of range of the original command. 1801 */ 1802 data_len = le32toh(r2t->r2tl); 1803 if (data_offset + data_len > cb->data_len) { 1804 mtx_unlock(&qp->tx_buffers.lock); 1805 nvmf_che_report_error(qp, 1806 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, 1807 pdu->m, pdu->hdr->hlen); 1808 nvmf_che_free_pdu(pdu); 1809 return (EBADMSG); 1810 } 1811 1812 cb->data_xfered += data_len; 1813 if (cb->data_xfered == cb->data_len) 1814 che_remove_command_buffer(&qp->tx_buffers, cb); 1815 else 1816 che_hold_command_buffer(cb); 1817 mtx_unlock(&qp->tx_buffers.lock); 1818 1819 /* 1820 * Queue one or more H2C_DATA PDUs containing the requested 1821 * data. 1822 */ 1823 while (data_len > 0) { 1824 struct mbuf *m; 1825 uint32_t sent, todo; 1826 1827 todo = min(data_len, qp->max_tx_data); 1828 m = nvmf_che_command_buffer_mbuf(cb, data_offset, todo, &sent, 1829 todo < data_len); 1830 che_send_h2c_pdu(qp, r2t->cccid, r2t->ttag, data_offset, m, 1831 sent, sent == data_len); 1832 1833 data_offset += sent; 1834 data_len -= sent; 1835 } 1836 1837 che_release_command_buffer(cb); 1838 nvmf_che_free_pdu(pdu); 1839 return (0); 1840 } 1841 1842 static int 1843 nvmf_che_dispatch_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu) 1844 { 1845 /* 1846 * The PDU header should always be contiguous in the mbuf from 1847 * CPL_NVMT_CMP. 1848 */ 1849 pdu->hdr = mtod(pdu->m, void *); 1850 KASSERT(pdu->m->m_len == pdu->hdr->hlen + 1851 ((pdu->hdr->flags & NVME_TCP_CH_FLAGS_HDGSTF) != 0 ? 1852 sizeof(uint32_t) : 0), 1853 ("%s: mismatched PDU header mbuf length", __func__)); 1854 1855 switch (pdu->hdr->pdu_type) { 1856 default: 1857 __assert_unreachable(); 1858 break; 1859 case NVME_TCP_PDU_TYPE_H2C_TERM_REQ: 1860 case NVME_TCP_PDU_TYPE_C2H_TERM_REQ: 1861 return (nvmf_che_handle_term_req(pdu)); 1862 case NVME_TCP_PDU_TYPE_CAPSULE_CMD: 1863 return (nvmf_che_save_command_capsule(qp, pdu)); 1864 case NVME_TCP_PDU_TYPE_CAPSULE_RESP: 1865 return (nvmf_che_save_response_capsule(qp, pdu)); 1866 case NVME_TCP_PDU_TYPE_H2C_DATA: 1867 return (nvmf_che_handle_h2c_data(qp, pdu)); 1868 case NVME_TCP_PDU_TYPE_C2H_DATA: 1869 return (nvmf_che_handle_c2h_data(qp, pdu)); 1870 case NVME_TCP_PDU_TYPE_R2T: 1871 return (nvmf_che_handle_r2t(qp, pdu)); 1872 } 1873 } 1874 1875 static int 1876 nvmf_che_attach_pdu_data(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu) 1877 { 1878 struct socket *so = qp->so; 1879 struct mbuf *m, *n; 1880 uint32_t tcp_seq; 1881 size_t len; 1882 int error; 1883 1884 /* Check for DDP data. */ 1885 if (pdu->ddp) { 1886 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_pdus, 1); 1887 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_octets, 1888 pdu->data_len); 1889 return (0); 1890 } 1891 1892 error = 0; 1893 len = pdu->data_len; 1894 tcp_seq = pdu->m->m_pkthdr.nvmf_tcp_seq; 1895 m = pdu->m; 1896 SOCKBUF_LOCK(&so->so_rcv); 1897 while (len > 0) { 1898 n = mbufq_dequeue(&qp->rx_data); 1899 KASSERT(n != NULL, ("%s: missing %zu data", __func__, len)); 1900 if (n == NULL) { 1901 error = ENOBUFS; 1902 break; 1903 } 1904 1905 KASSERT(n->m_pkthdr.nvmf_tcp_seq == tcp_seq, 1906 ("%s: TCP seq mismatch", __func__)); 1907 KASSERT(n->m_pkthdr.len <= len, 1908 ("%s: too much data", __func__)); 1909 if (n->m_pkthdr.nvmf_tcp_seq != tcp_seq || 1910 n->m_pkthdr.len > len) { 1911 m_freem(n); 1912 error = ENOBUFS; 1913 break; 1914 } 1915 1916 #ifdef VERBOSE_TRACES 1917 CTR(KTR_CXGBE, "%s: tid %u len %d seq %u", __func__, 1918 qp->toep->tid, n->m_pkthdr.len, n->m_pkthdr.nvmf_tcp_seq); 1919 #endif 1920 pdu->m->m_pkthdr.len += n->m_pkthdr.len; 1921 len -= n->m_pkthdr.len; 1922 tcp_seq += n->m_pkthdr.len; 1923 m_demote_pkthdr(n); 1924 m->m_next = n; 1925 m = m_last(n); 1926 } 1927 SOCKBUF_UNLOCK(&so->so_rcv); 1928 1929 if (error == 0) { 1930 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_fl_pdus, 1); 1931 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_fl_octets, 1932 pdu->data_len); 1933 } 1934 return (error); 1935 } 1936 1937 static void 1938 nvmf_che_receive(void *arg) 1939 { 1940 struct nvmf_che_qpair *qp = arg; 1941 struct socket *so = qp->so; 1942 struct nvmf_che_rxpdu pdu; 1943 struct mbuf *m; 1944 int error, terror; 1945 1946 SOCKBUF_LOCK(&so->so_rcv); 1947 while (!qp->rx_shutdown) { 1948 /* Wait for a PDU. */ 1949 if (so->so_error != 0 || so->so_rerror != 0) { 1950 if (so->so_error != 0) 1951 error = so->so_error; 1952 else 1953 error = so->so_rerror; 1954 SOCKBUF_UNLOCK(&so->so_rcv); 1955 error: 1956 nvmf_qpair_error(&qp->qp, error); 1957 SOCKBUF_LOCK(&so->so_rcv); 1958 while (!qp->rx_shutdown) 1959 cv_wait(&qp->rx_cv, SOCKBUF_MTX(&so->so_rcv)); 1960 break; 1961 } 1962 1963 m = mbufq_dequeue(&qp->rx_pdus); 1964 if (m == NULL) { 1965 if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) != 0) { 1966 error = 0; 1967 SOCKBUF_UNLOCK(&so->so_rcv); 1968 goto error; 1969 } 1970 cv_wait(&qp->rx_cv, SOCKBUF_MTX(&so->so_rcv)); 1971 continue; 1972 } 1973 SOCKBUF_UNLOCK(&so->so_rcv); 1974 1975 pdu.m = m; 1976 pdu.hdr = mtod(m, const void *); 1977 pdu.ddp = (m->m_pkthdr.nvmf_cpl_status & CMP_STATUS_DDP) != 0; 1978 1979 error = nvmf_che_validate_pdu(qp, &pdu); 1980 if (error == 0 && pdu.data_len != 0) 1981 error = nvmf_che_attach_pdu_data(qp, &pdu); 1982 if (error != 0) 1983 nvmf_che_free_pdu(&pdu); 1984 else 1985 error = nvmf_che_dispatch_pdu(qp, &pdu); 1986 if (error != 0) { 1987 /* 1988 * If we received a termination request, close 1989 * the connection immediately. 1990 */ 1991 if (error == ECONNRESET) 1992 goto error; 1993 1994 /* 1995 * Wait for up to 30 seconds for the socket to 1996 * be closed by the other end. 1997 */ 1998 SOCKBUF_LOCK(&so->so_rcv); 1999 if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { 2000 terror = cv_timedwait(&qp->rx_cv, 2001 SOCKBUF_MTX(&so->so_rcv), 30 * hz); 2002 if (terror == ETIMEDOUT) 2003 printf("NVMe/TCP: Timed out after sending terminate request\n"); 2004 } 2005 SOCKBUF_UNLOCK(&so->so_rcv); 2006 goto error; 2007 } 2008 2009 SOCKBUF_LOCK(&so->so_rcv); 2010 } 2011 SOCKBUF_UNLOCK(&so->so_rcv); 2012 kthread_exit(); 2013 } 2014 2015 static int 2016 nvmf_che_soupcall_receive(struct socket *so, void *arg, int waitflag) 2017 { 2018 struct nvmf_che_qpair *qp = arg; 2019 2020 cv_signal(&qp->rx_cv); 2021 return (SU_OK); 2022 } 2023 2024 static int 2025 do_nvmt_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 2026 { 2027 struct adapter *sc = iq->adapter; 2028 struct nvmf_che_adapter *nca = sc->nvme_ulp_softc; 2029 const struct cpl_nvmt_data *cpl; 2030 u_int tid; 2031 struct toepcb *toep; 2032 struct nvmf_che_qpair *qp; 2033 struct socket *so; 2034 struct inpcb *inp; 2035 struct tcpcb *tp; 2036 int len __diagused; 2037 2038 if (nca->nvmt_data_iqe) { 2039 cpl = (const void *)(rss + 1); 2040 } else { 2041 cpl = mtod(m, const void *); 2042 2043 /* strip off CPL header */ 2044 m_adj(m, sizeof(*cpl)); 2045 } 2046 tid = GET_TID(cpl); 2047 toep = lookup_tid(sc, tid); 2048 2049 KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__)); 2050 2051 len = m->m_pkthdr.len; 2052 2053 KASSERT(len == be16toh(cpl->length), 2054 ("%s: payload length mismatch", __func__)); 2055 2056 inp = toep->inp; 2057 tp = intotcpcb(inp); 2058 INP_WLOCK(inp); 2059 if (tp->t_flags & TF_DISCONNECTED) { 2060 CTR(KTR_CXGBE, "%s: tid %u, rx (%d bytes), t_flags 0x%x", 2061 __func__, tid, len, tp->t_flags); 2062 INP_WUNLOCK(inp); 2063 m_freem(m); 2064 return (0); 2065 } 2066 2067 /* Save TCP sequence number. */ 2068 m->m_pkthdr.nvmf_tcp_seq = be32toh(cpl->seq); 2069 2070 qp = toep->ulpcb; 2071 so = qp->so; 2072 SOCKBUF_LOCK(&so->so_rcv); 2073 mbufq_enqueue(&qp->rx_data, m); 2074 SOCKBUF_UNLOCK(&so->so_rcv); 2075 2076 tp->t_rcvtime = ticks; 2077 2078 #ifdef VERBOSE_TRACES 2079 CTR(KTR_CXGBE, "%s: tid %u len %d seq %u", __func__, tid, len, 2080 be32toh(cpl->seq)); 2081 #endif 2082 2083 INP_WUNLOCK(inp); 2084 return (0); 2085 } 2086 2087 static int 2088 do_nvmt_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 2089 { 2090 struct adapter *sc = iq->adapter; 2091 const struct cpl_nvmt_cmp *cpl = mtod(m, const void *); 2092 u_int tid = GET_TID(cpl); 2093 struct toepcb *toep = lookup_tid(sc, tid); 2094 struct nvmf_che_qpair *qp = toep->ulpcb; 2095 struct socket *so = qp->so; 2096 struct inpcb *inp = toep->inp; 2097 struct tcpcb *tp = intotcpcb(inp); 2098 u_int hlen __diagused; 2099 bool empty; 2100 2101 KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__)); 2102 KASSERT(!(toep->flags & TPF_SYNQE), 2103 ("%s: toep %p claims to be a synq entry", __func__, toep)); 2104 2105 /* strip off CPL header */ 2106 m_adj(m, sizeof(*cpl)); 2107 hlen = m->m_pkthdr.len; 2108 2109 KASSERT(hlen == be16toh(cpl->length), 2110 ("%s: payload length mismatch", __func__)); 2111 2112 INP_WLOCK(inp); 2113 if (tp->t_flags & TF_DISCONNECTED) { 2114 CTR(KTR_CXGBE, "%s: tid %u, rx (hlen %u), t_flags 0x%x", 2115 __func__, tid, hlen, tp->t_flags); 2116 INP_WUNLOCK(inp); 2117 m_freem(m); 2118 return (0); 2119 } 2120 2121 #ifdef VERBOSE_TRACES 2122 CTR(KTR_CXGBE, "%s: tid %u hlen %u seq %u status %u", __func__, tid, 2123 hlen, be32toh(cpl->seq), cpl->status); 2124 #endif 2125 2126 /* Save TCP sequence number and CPL status. */ 2127 m->m_pkthdr.nvmf_tcp_seq = be32toh(cpl->seq); 2128 m->m_pkthdr.nvmf_cpl_status = cpl->status; 2129 2130 SOCKBUF_LOCK(&so->so_rcv); 2131 empty = mbufq_len(&qp->rx_pdus) == 0; 2132 mbufq_enqueue(&qp->rx_pdus, m); 2133 SOCKBUF_UNLOCK(&so->so_rcv); 2134 INP_WUNLOCK(inp); 2135 if (empty) 2136 cv_signal(&qp->rx_cv); 2137 return (0); 2138 } 2139 2140 static uint16_t 2141 che_alloc_fl_cid(struct nvmf_che_qpair *qp, uint16_t original_cid) 2142 { 2143 uint16_t new_cid; 2144 2145 mtx_lock(&qp->fl_cid_lock); 2146 new_cid = FL_CID_FINDFREE_AT(qp->fl_cid_set, qp->next_cid); 2147 if (new_cid == 0) { 2148 new_cid = FL_CID_FINDFREE_AT(qp->fl_cid_set, 0); 2149 MPASS(new_cid != 0); 2150 } 2151 new_cid--; 2152 FL_CID_BUSY(new_cid, qp->fl_cid_set); 2153 if (new_cid == CHE_MAX_FL_TAG) 2154 qp->next_cid = 0; 2155 else 2156 qp->next_cid = new_cid + 1; 2157 qp->fl_cids[new_cid] = original_cid; 2158 mtx_unlock(&qp->fl_cid_lock); 2159 2160 return (new_cid | CHE_FL_TAG_MASK); 2161 } 2162 2163 static uint16_t 2164 che_alloc_ddp_cid(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb) 2165 { 2166 mtx_assert(&qp->rx_buffers.lock, MA_OWNED); 2167 2168 return (che_alloc_ddp_tag(qp, cb)); 2169 } 2170 2171 static struct mbuf * 2172 che_command_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_capsule *cc) 2173 { 2174 struct nvmf_capsule *nc = &cc->nc; 2175 struct nvmf_che_command_buffer *cb; 2176 struct nvme_sgl_descriptor *sgl; 2177 struct nvme_tcp_cmd cmd; 2178 struct mbuf *top, *m; 2179 uint16_t cid; 2180 bool use_icd; 2181 2182 use_icd = false; 2183 cb = NULL; 2184 m = NULL; 2185 2186 if (nc->nc_data.io_len != 0) { 2187 cb = che_alloc_command_buffer(qp, &nc->nc_data, 0, 2188 nc->nc_data.io_len, nc->nc_sqe.cid); 2189 cb->original_cid = nc->nc_sqe.cid; 2190 2191 if (nc->nc_send_data && nc->nc_data.io_len <= qp->max_icd) { 2192 cid = che_alloc_fl_cid(qp, nc->nc_sqe.cid); 2193 use_icd = true; 2194 m = nvmf_che_command_buffer_mbuf(cb, 0, 2195 nc->nc_data.io_len, NULL, false); 2196 cb->data_xfered = nc->nc_data.io_len; 2197 che_release_command_buffer(cb); 2198 } else if (nc->nc_send_data) { 2199 cid = che_alloc_fl_cid(qp, nc->nc_sqe.cid); 2200 cb->cid = htole16(cid); 2201 mtx_lock(&qp->tx_buffers.lock); 2202 che_add_command_buffer(&qp->tx_buffers, cb); 2203 mtx_unlock(&qp->tx_buffers.lock); 2204 } else { 2205 mtx_lock(&qp->rx_buffers.lock); 2206 cid = che_alloc_ddp_cid(qp, cb); 2207 if (cid == CHE_DDP_NO_TAG) { 2208 cid = che_alloc_fl_cid(qp, nc->nc_sqe.cid); 2209 che_add_command_buffer(&qp->rx_buffers, cb); 2210 } 2211 cb->cid = htole16(cid); 2212 mtx_unlock(&qp->rx_buffers.lock); 2213 } 2214 } else 2215 cid = che_alloc_fl_cid(qp, nc->nc_sqe.cid); 2216 2217 #ifdef VERBOSE_TRACES 2218 CTR(KTR_CXGBE, "%s: tid %u allocated cid 0x%04x for 0x%04x", __func__, 2219 qp->toep->tid, cid, nc->nc_sqe.cid); 2220 #endif 2221 memset(&cmd, 0, sizeof(cmd)); 2222 cmd.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_CMD; 2223 cmd.ccsqe = nc->nc_sqe; 2224 cmd.ccsqe.cid = htole16(cid); 2225 2226 /* Populate SGL in SQE. */ 2227 sgl = &cmd.ccsqe.sgl; 2228 memset(sgl, 0, sizeof(*sgl)); 2229 sgl->address = 0; 2230 sgl->length = htole32(nc->nc_data.io_len); 2231 if (use_icd) { 2232 /* Use in-capsule data. */ 2233 sgl->type = NVME_SGL_TYPE_ICD; 2234 } else { 2235 /* Use a command buffer. */ 2236 sgl->type = NVME_SGL_TYPE_COMMAND_BUFFER; 2237 } 2238 2239 top = nvmf_che_construct_pdu(qp, &cmd, sizeof(cmd), m, m != NULL ? 2240 nc->nc_data.io_len : 0); 2241 return (top); 2242 } 2243 2244 static struct mbuf * 2245 che_response_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_capsule *cc) 2246 { 2247 struct nvmf_capsule *nc = &cc->nc; 2248 struct nvme_tcp_rsp rsp; 2249 2250 memset(&rsp, 0, sizeof(rsp)); 2251 rsp.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_RESP; 2252 rsp.rccqe = nc->nc_cqe; 2253 2254 return (nvmf_che_construct_pdu(qp, &rsp, sizeof(rsp), NULL, 0)); 2255 } 2256 2257 static struct mbuf * 2258 capsule_to_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_capsule *cc) 2259 { 2260 if (cc->nc.nc_qe_len == sizeof(struct nvme_command)) 2261 return (che_command_pdu(qp, cc)); 2262 else 2263 return (che_response_pdu(qp, cc)); 2264 } 2265 2266 static void 2267 nvmf_che_send(void *arg) 2268 { 2269 struct nvmf_che_qpair *qp = arg; 2270 struct nvmf_che_capsule *cc; 2271 struct socket *so = qp->so; 2272 struct mbuf *m; 2273 int error; 2274 2275 m = NULL; 2276 SOCKBUF_LOCK(&so->so_snd); 2277 while (!qp->tx_shutdown) { 2278 if (so->so_error != 0) { 2279 error = so->so_error; 2280 SOCKBUF_UNLOCK(&so->so_snd); 2281 m_freem(m); 2282 nvmf_qpair_error(&qp->qp, error); 2283 SOCKBUF_LOCK(&so->so_snd); 2284 while (!qp->tx_shutdown) 2285 cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd)); 2286 break; 2287 } 2288 2289 if (STAILQ_EMPTY(&qp->tx_capsules)) { 2290 cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd)); 2291 continue; 2292 } 2293 2294 /* Convert a capsule into a PDU. */ 2295 cc = STAILQ_FIRST(&qp->tx_capsules); 2296 STAILQ_REMOVE_HEAD(&qp->tx_capsules, link); 2297 SOCKBUF_UNLOCK(&so->so_snd); 2298 2299 m = capsule_to_pdu(qp, cc); 2300 che_release_capsule(cc); 2301 2302 nvmf_che_write_pdu(qp, m); 2303 2304 SOCKBUF_LOCK(&so->so_snd); 2305 } 2306 SOCKBUF_UNLOCK(&so->so_snd); 2307 kthread_exit(); 2308 } 2309 2310 static int 2311 nvmf_che_setsockopt(struct socket *so, u_int sspace, u_int rspace) 2312 { 2313 struct sockopt opt; 2314 int error, one = 1; 2315 2316 /* Don't lower the buffer sizes, just enforce a minimum. */ 2317 SOCKBUF_LOCK(&so->so_snd); 2318 if (sspace < so->so_snd.sb_hiwat) 2319 sspace = so->so_snd.sb_hiwat; 2320 SOCKBUF_UNLOCK(&so->so_snd); 2321 SOCKBUF_LOCK(&so->so_rcv); 2322 if (rspace < so->so_rcv.sb_hiwat) 2323 rspace = so->so_rcv.sb_hiwat; 2324 SOCKBUF_UNLOCK(&so->so_rcv); 2325 2326 error = soreserve(so, sspace, rspace); 2327 if (error != 0) 2328 return (error); 2329 SOCKBUF_LOCK(&so->so_snd); 2330 so->so_snd.sb_flags |= SB_AUTOSIZE; 2331 SOCKBUF_UNLOCK(&so->so_snd); 2332 SOCKBUF_LOCK(&so->so_rcv); 2333 so->so_rcv.sb_flags |= SB_AUTOSIZE; 2334 SOCKBUF_UNLOCK(&so->so_rcv); 2335 2336 /* 2337 * Disable Nagle. 2338 */ 2339 bzero(&opt, sizeof(opt)); 2340 opt.sopt_dir = SOPT_SET; 2341 opt.sopt_level = IPPROTO_TCP; 2342 opt.sopt_name = TCP_NODELAY; 2343 opt.sopt_val = &one; 2344 opt.sopt_valsize = sizeof(one); 2345 error = sosetopt(so, &opt); 2346 if (error != 0) 2347 return (error); 2348 2349 return (0); 2350 } 2351 2352 static void 2353 t4_nvme_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, 2354 uint64_t val) 2355 { 2356 struct adapter *sc = td_adapter(toep->td); 2357 2358 t4_set_tcb_field(sc, &toep->ofld_txq->wrq, toep, word, mask, val, 0, 0); 2359 } 2360 2361 static void 2362 set_ulp_mode_nvme(struct toepcb *toep, u_int ulp_submode, uint8_t rxpda) 2363 { 2364 uint64_t val; 2365 2366 CTR(KTR_CXGBE, "%s: tid %u, ULP_MODE_NVMET, submode=%#x, rxpda=%u", 2367 __func__, toep->tid, ulp_submode, rxpda); 2368 2369 val = V_TCB_ULP_TYPE(ULP_MODE_NVMET) | V_TCB_ULP_RAW(ulp_submode); 2370 t4_nvme_set_tcb_field(toep, W_TCB_ULP_TYPE, 2371 V_TCB_ULP_TYPE(M_TCB_ULP_TYPE) | V_TCB_ULP_RAW(M_TCB_ULP_RAW), val); 2372 2373 val = V_TF_RX_FLOW_CONTROL_DISABLE(1ULL); 2374 t4_nvme_set_tcb_field(toep, W_TCB_T_FLAGS, val, val); 2375 2376 val = V_TCB_RSVD((rxpda / 4) - 1); 2377 t4_nvme_set_tcb_field(toep, W_TCB_RSVD, V_TCB_RSVD(M_TCB_RSVD), val); 2378 2379 /* 0 disables CPL_NVMT_CMP_IMM which is not useful in this driver. */ 2380 val = 0; 2381 t4_nvme_set_tcb_field(toep, W_TCB_CMP_IMM_SZ, 2382 V_TCB_CMP_IMM_SZ(M_TCB_CMP_IMM_SZ), val); 2383 } 2384 2385 static u_int 2386 pdu_max_data_len(const nvlist_t *nvl, u_int max_pdu_len, u_int hlen, 2387 uint8_t pda) 2388 { 2389 u_int max_data_len; 2390 2391 if (nvlist_get_bool(nvl, "header_digests")) 2392 hlen += sizeof(uint32_t); 2393 hlen = roundup(hlen, pda); 2394 max_data_len = max_pdu_len - hlen; 2395 if (nvlist_get_bool(nvl, "data_digests")) 2396 max_data_len -= sizeof(uint32_t); 2397 return (max_data_len); 2398 } 2399 2400 static struct nvmf_qpair * 2401 che_allocate_qpair(bool controller, const nvlist_t *nvl) 2402 { 2403 struct nvmf_che_adapter *nca; 2404 struct nvmf_che_qpair *qp; 2405 struct adapter *sc; 2406 struct file *fp; 2407 struct socket *so; 2408 struct inpcb *inp; 2409 struct tcpcb *tp; 2410 struct toepcb *toep; 2411 cap_rights_t rights; 2412 u_int max_tx_pdu_len, num_ddp_tags; 2413 int error, ulp_submode; 2414 2415 if (!nvlist_exists_number(nvl, "fd") || 2416 !nvlist_exists_number(nvl, "rxpda") || 2417 !nvlist_exists_number(nvl, "txpda") || 2418 !nvlist_exists_bool(nvl, "header_digests") || 2419 !nvlist_exists_bool(nvl, "data_digests") || 2420 !nvlist_exists_number(nvl, "maxr2t") || 2421 !nvlist_exists_number(nvl, "maxh2cdata") || 2422 !nvlist_exists_number(nvl, "max_icd")) 2423 return (NULL); 2424 2425 error = fget(curthread, nvlist_get_number(nvl, "fd"), 2426 cap_rights_init_one(&rights, CAP_SOCK_CLIENT), &fp); 2427 if (error != 0) 2428 return (NULL); 2429 if (fp->f_type != DTYPE_SOCKET) { 2430 fdrop(fp, curthread); 2431 return (NULL); 2432 } 2433 so = fp->f_data; 2434 if (so->so_type != SOCK_STREAM || 2435 so->so_proto->pr_protocol != IPPROTO_TCP) { 2436 fdrop(fp, curthread); 2437 return (NULL); 2438 } 2439 2440 sc = find_offload_adapter(so); 2441 if (sc == NULL) { 2442 fdrop(fp, curthread); 2443 return (NULL); 2444 } 2445 nca = sc->nvme_ulp_softc; 2446 2447 /* 2448 * Controller: Require advertised MAXH2CDATA to be small 2449 * enough. 2450 */ 2451 if (controller) { 2452 u_int max_rx_data; 2453 2454 max_rx_data = pdu_max_data_len(nvl, nca->max_receive_pdu, 2455 sizeof(struct nvme_tcp_h2c_data_hdr), 2456 nvlist_get_number(nvl, "rxpda")); 2457 if (nvlist_get_number(nvl, "maxh2cdata") > max_rx_data) { 2458 fdrop(fp, curthread); 2459 return (NULL); 2460 } 2461 } 2462 2463 /* 2464 * Host: Require the queue size to be small enough that all of 2465 * the command ids allocated by nvmf(4) will fit in the 2466 * unallocated range. 2467 * 2468 * XXX: Alternatively this driver could just queue commands 2469 * when an unallocated ID isn't available. 2470 */ 2471 if (!controller) { 2472 u_int num_commands; 2473 2474 num_commands = nvlist_get_number(nvl, "qsize") - 1; 2475 if (nvlist_get_bool(nvl, "admin")) 2476 num_commands += 8; /* Max AER */ 2477 if (num_commands > CHE_NUM_FL_TAGS) { 2478 fdrop(fp, curthread); 2479 return (NULL); 2480 } 2481 } 2482 2483 qp = malloc(sizeof(*qp), M_NVMF_CHE, M_WAITOK | M_ZERO); 2484 qp->txpda = nvlist_get_number(nvl, "txpda"); 2485 qp->rxpda = nvlist_get_number(nvl, "rxpda"); 2486 qp->header_digests = nvlist_get_bool(nvl, "header_digests"); 2487 qp->data_digests = nvlist_get_bool(nvl, "data_digests"); 2488 qp->maxr2t = nvlist_get_number(nvl, "maxr2t"); 2489 if (controller) 2490 qp->maxh2cdata = nvlist_get_number(nvl, "maxh2cdata"); 2491 2492 if (controller) { 2493 /* NB: maxr2t is 0's based. */ 2494 qp->num_fl_ttags = MIN(CHE_NUM_FL_TAGS, 2495 nvlist_get_number(nvl, "qsize") * 2496 ((uint64_t)qp->maxr2t + 1)); 2497 qp->open_fl_ttags = mallocarray(qp->num_fl_ttags, 2498 sizeof(*qp->open_fl_ttags), M_NVMF_CHE, M_WAITOK | M_ZERO); 2499 } else { 2500 qp->fl_cids = mallocarray(CHE_NUM_FL_TAGS, 2501 sizeof(*qp->fl_cids), M_NVMF_CHE, M_WAITOK | M_ZERO); 2502 qp->fl_cid_set = malloc(sizeof(*qp->fl_cid_set), M_NVMF_CHE, 2503 M_WAITOK); 2504 FL_CID_INIT(qp->fl_cid_set); 2505 mtx_init(&qp->fl_cid_lock, "nvmf/che fl cids", NULL, MTX_DEF); 2506 } 2507 2508 inp = sotoinpcb(so); 2509 INP_WLOCK(inp); 2510 tp = intotcpcb(inp); 2511 if (tp->t_flags & TF_DISCONNECTED) { 2512 INP_WUNLOCK(inp); 2513 free(qp->fl_cid_set, M_NVMF_CHE); 2514 free(qp->fl_cids, M_NVMF_CHE); 2515 free(qp->open_fl_ttags, M_NVMF_CHE); 2516 free(qp, M_NVMF_CHE); 2517 fdrop(fp, curthread); 2518 return (NULL); 2519 } 2520 2521 MPASS(tp->t_flags & TF_TOE); 2522 MPASS(tp->tod != NULL); 2523 MPASS(tp->t_toe != NULL); 2524 toep = tp->t_toe; 2525 MPASS(toep->vi->adapter == sc); 2526 2527 if (ulp_mode(toep) != ULP_MODE_NONE) { 2528 INP_WUNLOCK(inp); 2529 free(qp->fl_cid_set, M_NVMF_CHE); 2530 free(qp->fl_cids, M_NVMF_CHE); 2531 free(qp->open_fl_ttags, M_NVMF_CHE); 2532 free(qp, M_NVMF_CHE); 2533 fdrop(fp, curthread); 2534 return (NULL); 2535 } 2536 2537 /* Claim socket from file descriptor. */ 2538 fp->f_ops = &badfileops; 2539 fp->f_data = NULL; 2540 2541 qp->so = so; 2542 qp->toep = toep; 2543 qp->nca = nca; 2544 refcount_init(&qp->refs, 1); 2545 2546 /* NB: C2H and H2C headers are the same size. */ 2547 qp->max_rx_data = pdu_max_data_len(nvl, nca->max_receive_pdu, 2548 sizeof(struct nvme_tcp_c2h_data_hdr), qp->rxpda); 2549 qp->max_tx_data = pdu_max_data_len(nvl, nca->max_transmit_pdu, 2550 sizeof(struct nvme_tcp_c2h_data_hdr), qp->txpda); 2551 if (!controller) { 2552 qp->max_tx_data = min(qp->max_tx_data, 2553 nvlist_get_number(nvl, "maxh2cdata")); 2554 qp->max_icd = min(nvlist_get_number(nvl, "max_icd"), 2555 pdu_max_data_len(nvl, nca->max_transmit_pdu, 2556 sizeof(struct nvme_tcp_cmd), qp->txpda)); 2557 } else { 2558 /* 2559 * IOCCSZ represents the size of a logical command 2560 * capsule including the 64 byte SQE and the 2561 * in-capsule data. Use pdu_max_data_len to compute 2562 * the maximum supported ICD length. 2563 */ 2564 qp->max_ioccsz = rounddown(pdu_max_data_len(nvl, 2565 nca->max_receive_pdu, sizeof(struct nvme_tcp_cmd), 2566 qp->rxpda), 16) + sizeof(struct nvme_command); 2567 } 2568 2569 ulp_submode = 0; 2570 if (qp->header_digests) 2571 ulp_submode |= FW_NVMET_ULPSUBMODE_HCRC; 2572 if (qp->data_digests) 2573 ulp_submode |= FW_NVMET_ULPSUBMODE_DCRC; 2574 if (!controller) 2575 ulp_submode |= FW_NVMET_ULPSUBMODE_ING_DIR; 2576 2577 max_tx_pdu_len = sizeof(struct nvme_tcp_h2c_data_hdr); 2578 if (qp->header_digests) 2579 max_tx_pdu_len += sizeof(uint32_t); 2580 max_tx_pdu_len = roundup(max_tx_pdu_len, qp->txpda); 2581 max_tx_pdu_len += qp->max_tx_data; 2582 if (qp->data_digests) 2583 max_tx_pdu_len += sizeof(uint32_t); 2584 2585 /* TODO: ISO limits */ 2586 2587 if (controller) { 2588 /* Use the SUCCESS flag if SQ flow control is disabled. */ 2589 qp->send_success = !nvlist_get_bool(nvl, "sq_flow_control"); 2590 } 2591 2592 toep->params.ulp_mode = ULP_MODE_NVMET; 2593 toep->ulpcb = qp; 2594 2595 send_txdataplen_max_flowc_wr(sc, toep, 2596 roundup(/* max_iso_pdus * */ max_tx_pdu_len, tp->t_maxseg)); 2597 set_ulp_mode_nvme(toep, ulp_submode, qp->rxpda); 2598 INP_WUNLOCK(inp); 2599 2600 fdrop(fp, curthread); 2601 2602 error = nvmf_che_setsockopt(so, max_tx_pdu_len, nca->max_receive_pdu); 2603 if (error != 0) { 2604 free(qp->fl_cid_set, M_NVMF_CHE); 2605 free(qp->fl_cids, M_NVMF_CHE); 2606 free(qp->open_fl_ttags, M_NVMF_CHE); 2607 free(qp, M_NVMF_CHE); 2608 return (NULL); 2609 } 2610 2611 num_ddp_tags = ddp_tags_per_qp; 2612 if (num_ddp_tags > 0) { 2613 qp->tpt_offset = t4_stag_alloc(sc, num_ddp_tags); 2614 if (qp->tpt_offset != T4_STAG_UNSET) { 2615 #ifdef VERBOSE_TRACES 2616 CTR(KTR_CXGBE, 2617 "%s: tid %u using %u tags at offset 0x%x", 2618 __func__, toep->tid, num_ddp_tags, qp->tpt_offset); 2619 #endif 2620 qp->num_ddp_tags = num_ddp_tags; 2621 qp->open_ddp_tags = mallocarray(qp->num_ddp_tags, 2622 sizeof(*qp->open_ddp_tags), M_NVMF_CHE, M_WAITOK | 2623 M_ZERO); 2624 2625 t4_nvme_set_tcb_field(toep, W_TCB_TPT_OFFSET, 2626 M_TCB_TPT_OFFSET, V_TCB_TPT_OFFSET(qp->tpt_offset)); 2627 } 2628 } 2629 2630 TAILQ_INIT(&qp->rx_buffers.head); 2631 TAILQ_INIT(&qp->tx_buffers.head); 2632 mtx_init(&qp->rx_buffers.lock, "nvmf/che rx buffers", NULL, MTX_DEF); 2633 mtx_init(&qp->tx_buffers.lock, "nvmf/che tx buffers", NULL, MTX_DEF); 2634 2635 cv_init(&qp->rx_cv, "-"); 2636 cv_init(&qp->tx_cv, "-"); 2637 mbufq_init(&qp->rx_data, 0); 2638 mbufq_init(&qp->rx_pdus, 0); 2639 STAILQ_INIT(&qp->tx_capsules); 2640 2641 /* Register socket upcall for receive to handle remote FIN. */ 2642 SOCKBUF_LOCK(&so->so_rcv); 2643 soupcall_set(so, SO_RCV, nvmf_che_soupcall_receive, qp); 2644 SOCKBUF_UNLOCK(&so->so_rcv); 2645 2646 /* Spin up kthreads. */ 2647 error = kthread_add(nvmf_che_receive, qp, NULL, &qp->rx_thread, 0, 0, 2648 "nvmef che rx"); 2649 if (error != 0) { 2650 che_free_qpair(&qp->qp); 2651 return (NULL); 2652 } 2653 error = kthread_add(nvmf_che_send, qp, NULL, &qp->tx_thread, 0, 0, 2654 "nvmef che tx"); 2655 if (error != 0) { 2656 che_free_qpair(&qp->qp); 2657 return (NULL); 2658 } 2659 2660 return (&qp->qp); 2661 } 2662 2663 static void 2664 che_release_qpair(struct nvmf_che_qpair *qp) 2665 { 2666 if (refcount_release(&qp->refs)) 2667 free(qp, M_NVMF_CHE); 2668 } 2669 2670 static void 2671 che_free_qpair(struct nvmf_qpair *nq) 2672 { 2673 struct nvmf_che_qpair *qp = CQP(nq); 2674 struct nvmf_che_command_buffer *ncb, *cb; 2675 struct nvmf_che_capsule *ncc, *cc; 2676 struct socket *so = qp->so; 2677 struct toepcb *toep = qp->toep; 2678 struct inpcb *inp = sotoinpcb(so); 2679 2680 /* Shut down kthreads. */ 2681 SOCKBUF_LOCK(&so->so_snd); 2682 qp->tx_shutdown = true; 2683 if (qp->tx_thread != NULL) { 2684 cv_signal(&qp->tx_cv); 2685 mtx_sleep(qp->tx_thread, SOCKBUF_MTX(&so->so_snd), 0, 2686 "nvchetx", 0); 2687 } 2688 SOCKBUF_UNLOCK(&so->so_snd); 2689 2690 SOCKBUF_LOCK(&so->so_rcv); 2691 qp->rx_shutdown = true; 2692 if (qp->rx_thread != NULL) { 2693 cv_signal(&qp->rx_cv); 2694 mtx_sleep(qp->rx_thread, SOCKBUF_MTX(&so->so_rcv), 0, 2695 "nvcherx", 0); 2696 } 2697 soupcall_clear(so, SO_RCV); 2698 SOCKBUF_UNLOCK(&so->so_rcv); 2699 mbufq_drain(&qp->rx_data); 2700 mbufq_drain(&qp->rx_pdus); 2701 2702 STAILQ_FOREACH_SAFE(cc, &qp->tx_capsules, link, ncc) { 2703 nvmf_abort_capsule_data(&cc->nc, ECONNABORTED); 2704 che_release_capsule(cc); 2705 } 2706 2707 cv_destroy(&qp->tx_cv); 2708 cv_destroy(&qp->rx_cv); 2709 2710 if (qp->open_fl_ttags != NULL) { 2711 for (u_int i = 0; i < qp->num_fl_ttags; i++) { 2712 cb = qp->open_fl_ttags[i]; 2713 if (cb != NULL) { 2714 cb->cc->active_r2ts--; 2715 cb->error = ECONNABORTED; 2716 che_release_command_buffer(cb); 2717 } 2718 } 2719 free(qp->open_fl_ttags, M_NVMF_CHE); 2720 } 2721 if (qp->num_ddp_tags != 0) { 2722 for (u_int i = 0; i < qp->num_ddp_tags; i++) { 2723 cb = qp->open_ddp_tags[i]; 2724 if (cb != NULL) { 2725 if (cb->cc != NULL) 2726 cb->cc->active_r2ts--; 2727 cb->error = ECONNABORTED; 2728 mtx_lock(&qp->rx_buffers.lock); 2729 che_free_ddp_tag(qp, cb, cb->ttag); 2730 mtx_unlock(&qp->rx_buffers.lock); 2731 che_release_command_buffer(cb); 2732 } 2733 } 2734 free(qp->open_ddp_tags, M_NVMF_CHE); 2735 } 2736 2737 mtx_lock(&qp->rx_buffers.lock); 2738 TAILQ_FOREACH_SAFE(cb, &qp->rx_buffers.head, link, ncb) { 2739 che_remove_command_buffer(&qp->rx_buffers, cb); 2740 mtx_unlock(&qp->rx_buffers.lock); 2741 #ifdef INVARIANTS 2742 if (cb->cc != NULL) 2743 cb->cc->pending_r2ts--; 2744 #endif 2745 cb->error = ECONNABORTED; 2746 che_release_command_buffer(cb); 2747 mtx_lock(&qp->rx_buffers.lock); 2748 } 2749 mtx_destroy(&qp->rx_buffers.lock); 2750 2751 mtx_lock(&qp->tx_buffers.lock); 2752 TAILQ_FOREACH_SAFE(cb, &qp->tx_buffers.head, link, ncb) { 2753 che_remove_command_buffer(&qp->tx_buffers, cb); 2754 mtx_unlock(&qp->tx_buffers.lock); 2755 cb->error = ECONNABORTED; 2756 che_release_command_buffer(cb); 2757 mtx_lock(&qp->tx_buffers.lock); 2758 } 2759 mtx_destroy(&qp->tx_buffers.lock); 2760 2761 if (qp->num_ddp_tags != 0) 2762 t4_stag_free(qp->nca->sc, qp->tpt_offset, qp->num_ddp_tags); 2763 2764 if (!qp->qp.nq_controller) { 2765 free(qp->fl_cids, M_NVMF_CHE); 2766 free(qp->fl_cid_set, M_NVMF_CHE); 2767 mtx_destroy(&qp->fl_cid_lock); 2768 } 2769 2770 INP_WLOCK(inp); 2771 toep->ulpcb = NULL; 2772 mbufq_drain(&toep->ulp_pduq); 2773 2774 /* 2775 * Grab a reference to use when waiting for the final CPL to 2776 * be received. If toep->inp is NULL, then 2777 * final_cpl_received() has already been called (e.g. due to 2778 * the peer sending a RST). 2779 */ 2780 if (toep->inp != NULL) { 2781 toep = hold_toepcb(toep); 2782 toep->flags |= TPF_WAITING_FOR_FINAL; 2783 } else 2784 toep = NULL; 2785 INP_WUNLOCK(inp); 2786 2787 soclose(so); 2788 2789 /* 2790 * Wait for the socket to fully close. This ensures any 2791 * pending received data has been received (and in particular, 2792 * any data that would be received by DDP has been handled). 2793 */ 2794 if (toep != NULL) { 2795 struct mtx *lock = mtx_pool_find(mtxpool_sleep, toep); 2796 2797 mtx_lock(lock); 2798 while ((toep->flags & TPF_WAITING_FOR_FINAL) != 0) 2799 mtx_sleep(toep, lock, PSOCK, "conclo2", 0); 2800 mtx_unlock(lock); 2801 free_toepcb(toep); 2802 } 2803 2804 che_release_qpair(qp); 2805 } 2806 2807 static uint32_t 2808 che_max_ioccsz(struct nvmf_qpair *nq) 2809 { 2810 struct nvmf_che_qpair *qp = CQP(nq); 2811 2812 /* 2813 * Limit the command capsule size so that with maximum ICD it 2814 * fits within the limit of the largest PDU the adapter can 2815 * receive. 2816 */ 2817 return (qp->max_ioccsz); 2818 } 2819 2820 static uint64_t 2821 che_max_xfer_size(struct nvmf_qpair *nq) 2822 { 2823 struct nvmf_che_qpair *qp = CQP(nq); 2824 2825 /* 2826 * Limit host transfers to the size of the data payload in the 2827 * largest PDU the adapter can receive. 2828 */ 2829 return (qp->max_rx_data); 2830 } 2831 2832 static struct nvmf_capsule * 2833 che_allocate_capsule(struct nvmf_qpair *nq, int how) 2834 { 2835 struct nvmf_che_qpair *qp = CQP(nq); 2836 struct nvmf_che_capsule *cc; 2837 2838 cc = malloc(sizeof(*cc), M_NVMF_CHE, how | M_ZERO); 2839 if (cc == NULL) 2840 return (NULL); 2841 refcount_init(&cc->refs, 1); 2842 refcount_acquire(&qp->refs); 2843 return (&cc->nc); 2844 } 2845 2846 static void 2847 che_release_capsule(struct nvmf_che_capsule *cc) 2848 { 2849 struct nvmf_che_qpair *qp = CQP(cc->nc.nc_qpair); 2850 2851 if (!refcount_release(&cc->refs)) 2852 return; 2853 2854 MPASS(cc->active_r2ts == 0); 2855 MPASS(cc->pending_r2ts == 0); 2856 2857 nvmf_che_free_pdu(&cc->rx_pdu); 2858 free(cc, M_NVMF_CHE); 2859 che_release_qpair(qp); 2860 } 2861 2862 static void 2863 che_free_capsule(struct nvmf_capsule *nc) 2864 { 2865 che_release_capsule(CCAP(nc)); 2866 } 2867 2868 static int 2869 che_transmit_capsule(struct nvmf_capsule *nc) 2870 { 2871 struct nvmf_che_qpair *qp = CQP(nc->nc_qpair); 2872 struct nvmf_che_capsule *cc = CCAP(nc); 2873 struct socket *so = qp->so; 2874 2875 refcount_acquire(&cc->refs); 2876 SOCKBUF_LOCK(&so->so_snd); 2877 STAILQ_INSERT_TAIL(&qp->tx_capsules, cc, link); 2878 cv_signal(&qp->tx_cv); 2879 SOCKBUF_UNLOCK(&so->so_snd); 2880 return (0); 2881 } 2882 2883 static uint8_t 2884 che_validate_command_capsule(struct nvmf_capsule *nc) 2885 { 2886 struct nvmf_che_capsule *cc = CCAP(nc); 2887 struct nvme_sgl_descriptor *sgl; 2888 2889 KASSERT(cc->rx_pdu.hdr != NULL, ("capsule wasn't received")); 2890 2891 sgl = &nc->nc_sqe.sgl; 2892 switch (sgl->type) { 2893 case NVME_SGL_TYPE_ICD: 2894 if (cc->rx_pdu.data_len != le32toh(sgl->length)) { 2895 printf("NVMe/TCP: Command Capsule with mismatched ICD length\n"); 2896 return (NVME_SC_DATA_SGL_LENGTH_INVALID); 2897 } 2898 break; 2899 case NVME_SGL_TYPE_COMMAND_BUFFER: 2900 if (cc->rx_pdu.data_len != 0) { 2901 printf("NVMe/TCP: Command Buffer SGL with ICD\n"); 2902 return (NVME_SC_INVALID_FIELD); 2903 } 2904 break; 2905 default: 2906 printf("NVMe/TCP: Invalid SGL type in Command Capsule\n"); 2907 return (NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID); 2908 } 2909 2910 if (sgl->address != 0) { 2911 printf("NVMe/TCP: Invalid SGL offset in Command Capsule\n"); 2912 return (NVME_SC_SGL_OFFSET_INVALID); 2913 } 2914 2915 return (NVME_SC_SUCCESS); 2916 } 2917 2918 static size_t 2919 che_capsule_data_len(const struct nvmf_capsule *nc) 2920 { 2921 MPASS(nc->nc_qe_len == sizeof(struct nvme_command)); 2922 return (le32toh(nc->nc_sqe.sgl.length)); 2923 } 2924 2925 static void 2926 che_receive_r2t_data(struct nvmf_capsule *nc, uint32_t data_offset, 2927 struct nvmf_io_request *io) 2928 { 2929 struct nvmf_che_qpair *qp = CQP(nc->nc_qpair); 2930 struct nvmf_che_capsule *cc = CCAP(nc); 2931 struct nvmf_che_command_buffer *cb; 2932 2933 cb = che_alloc_command_buffer(qp, io, data_offset, io->io_len, 2934 nc->nc_sqe.cid); 2935 2936 cb->cc = cc; 2937 refcount_acquire(&cc->refs); 2938 2939 /* 2940 * If this command has too many active R2Ts or there are no 2941 * available transfer tags, queue the request for later. 2942 * 2943 * NB: maxr2t is 0's based. 2944 */ 2945 mtx_lock(&qp->rx_buffers.lock); 2946 if (cc->active_r2ts > qp->maxr2t || 2947 !nvmf_che_allocate_ttag(qp, cb)) { 2948 #ifdef INVARIANTS 2949 cc->pending_r2ts++; 2950 #endif 2951 TAILQ_INSERT_TAIL(&qp->rx_buffers.head, cb, link); 2952 mtx_unlock(&qp->rx_buffers.lock); 2953 return; 2954 } 2955 mtx_unlock(&qp->rx_buffers.lock); 2956 2957 che_send_r2t(qp, nc->nc_sqe.cid, cb->ttag, data_offset, io->io_len); 2958 } 2959 2960 static void 2961 che_receive_icd_data(struct nvmf_capsule *nc, uint32_t data_offset, 2962 struct nvmf_io_request *io) 2963 { 2964 struct nvmf_che_capsule *cc = CCAP(nc); 2965 2966 /* 2967 * The header is in rx_pdu.m, the padding is discarded, and 2968 * the data starts at rx_pdu.m->m_next. 2969 */ 2970 mbuf_copyto_io(cc->rx_pdu.m->m_next, data_offset, io->io_len, io, 0); 2971 nvmf_complete_io_request(io, io->io_len, 0); 2972 } 2973 2974 static int 2975 che_receive_controller_data(struct nvmf_capsule *nc, uint32_t data_offset, 2976 struct nvmf_io_request *io) 2977 { 2978 struct nvme_sgl_descriptor *sgl; 2979 size_t data_len; 2980 2981 if (nc->nc_qe_len != sizeof(struct nvme_command) || 2982 !nc->nc_qpair->nq_controller) 2983 return (EINVAL); 2984 2985 sgl = &nc->nc_sqe.sgl; 2986 data_len = le32toh(sgl->length); 2987 if (data_offset + io->io_len > data_len) 2988 return (EFBIG); 2989 2990 if (sgl->type == NVME_SGL_TYPE_ICD) 2991 che_receive_icd_data(nc, data_offset, io); 2992 else 2993 che_receive_r2t_data(nc, data_offset, io); 2994 return (0); 2995 } 2996 2997 /* NB: cid is little-endian already. */ 2998 static void 2999 che_send_c2h_pdu(struct nvmf_che_qpair *qp, uint16_t cid, uint32_t data_offset, 3000 struct mbuf *m, size_t len, bool last_pdu, bool success) 3001 { 3002 struct nvme_tcp_c2h_data_hdr c2h; 3003 struct mbuf *top; 3004 3005 memset(&c2h, 0, sizeof(c2h)); 3006 c2h.common.pdu_type = NVME_TCP_PDU_TYPE_C2H_DATA; 3007 if (last_pdu) 3008 c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_LAST_PDU; 3009 if (success) 3010 c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_SUCCESS; 3011 c2h.cccid = cid; 3012 c2h.datao = htole32(data_offset); 3013 c2h.datal = htole32(len); 3014 3015 top = nvmf_che_construct_pdu(qp, &c2h, sizeof(c2h), m, len); 3016 nvmf_che_write_pdu(qp, top); 3017 } 3018 3019 static u_int 3020 che_send_controller_data(struct nvmf_capsule *nc, uint32_t data_offset, 3021 struct mbuf *m, size_t len) 3022 { 3023 struct nvmf_che_qpair *qp = CQP(nc->nc_qpair); 3024 struct nvme_sgl_descriptor *sgl; 3025 uint32_t data_len; 3026 bool last_pdu, last_xfer; 3027 3028 if (nc->nc_qe_len != sizeof(struct nvme_command) || 3029 !qp->qp.nq_controller) { 3030 m_freem(m); 3031 return (NVME_SC_INVALID_FIELD); 3032 } 3033 3034 sgl = &nc->nc_sqe.sgl; 3035 data_len = le32toh(sgl->length); 3036 if (data_offset + len > data_len) { 3037 m_freem(m); 3038 return (NVME_SC_INVALID_FIELD); 3039 } 3040 last_xfer = (data_offset + len == data_len); 3041 3042 if (sgl->type != NVME_SGL_TYPE_COMMAND_BUFFER) { 3043 m_freem(m); 3044 return (NVME_SC_INVALID_FIELD); 3045 } 3046 3047 KASSERT(data_offset == CCAP(nc)->tx_data_offset, 3048 ("%s: starting data_offset %u doesn't match end of previous xfer %u", 3049 __func__, data_offset, CCAP(nc)->tx_data_offset)); 3050 3051 /* Queue one or more C2H_DATA PDUs containing the data from 'm'. */ 3052 while (m != NULL) { 3053 struct mbuf *n; 3054 uint32_t todo; 3055 3056 if (m->m_len > qp->max_tx_data) { 3057 n = m_split(m, qp->max_tx_data, M_WAITOK); 3058 todo = m->m_len; 3059 } else { 3060 struct mbuf *p; 3061 3062 todo = m->m_len; 3063 p = m; 3064 n = p->m_next; 3065 while (n != NULL) { 3066 if (todo + n->m_len > qp->max_tx_data) { 3067 p->m_next = NULL; 3068 break; 3069 } 3070 todo += n->m_len; 3071 p = n; 3072 n = p->m_next; 3073 } 3074 MPASS(m_length(m, NULL) == todo); 3075 } 3076 3077 last_pdu = (n == NULL && last_xfer); 3078 che_send_c2h_pdu(qp, nc->nc_sqe.cid, data_offset, m, todo, 3079 last_pdu, last_pdu && qp->send_success); 3080 3081 data_offset += todo; 3082 data_len -= todo; 3083 m = n; 3084 } 3085 MPASS(data_len == 0); 3086 3087 #ifdef INVARIANTS 3088 CCAP(nc)->tx_data_offset = data_offset; 3089 #endif 3090 if (!last_xfer) 3091 return (NVMF_MORE); 3092 else if (qp->send_success) 3093 return (NVMF_SUCCESS_SENT); 3094 else 3095 return (NVME_SC_SUCCESS); 3096 } 3097 3098 struct nvmf_transport_ops che_ops = { 3099 .allocate_qpair = che_allocate_qpair, 3100 .free_qpair = che_free_qpair, 3101 .max_ioccsz = che_max_ioccsz, 3102 .max_xfer_size = che_max_xfer_size, 3103 .allocate_capsule = che_allocate_capsule, 3104 .free_capsule = che_free_capsule, 3105 .transmit_capsule = che_transmit_capsule, 3106 .validate_command_capsule = che_validate_command_capsule, 3107 .capsule_data_len = che_capsule_data_len, 3108 .receive_controller_data = che_receive_controller_data, 3109 .send_controller_data = che_send_controller_data, 3110 .trtype = NVMF_TRTYPE_TCP, 3111 .priority = 10, 3112 }; 3113 3114 NVMF_TRANSPORT(che, che_ops); 3115 3116 static void 3117 read_pdu_limits(struct adapter *sc, u_int *max_tx_pdu_len, 3118 uint32_t *max_rx_pdu_len) 3119 { 3120 uint32_t tx_len, rx_len, r, v; 3121 3122 /* Copied from cxgbei, but not sure if this is correct. */ 3123 rx_len = t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE); 3124 tx_len = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE); 3125 3126 r = t4_read_reg(sc, A_TP_PARA_REG2); 3127 rx_len = min(rx_len, G_MAXRXDATA(r)); 3128 tx_len = min(tx_len, G_MAXRXDATA(r)); 3129 3130 r = t4_read_reg(sc, A_TP_PARA_REG7); 3131 v = min(G_PMMAXXFERLEN0(r), G_PMMAXXFERLEN1(r)); 3132 rx_len = min(rx_len, v); 3133 tx_len = min(tx_len, v); 3134 3135 /* Cannot be larger than 32KB - 256. */ 3136 rx_len = min(rx_len, 32512); 3137 tx_len = min(tx_len, 32512); 3138 3139 *max_tx_pdu_len = tx_len; 3140 *max_rx_pdu_len = rx_len; 3141 } 3142 3143 static int 3144 nvmf_che_init(struct adapter *sc, struct nvmf_che_adapter *nca) 3145 { 3146 struct sysctl_oid *oid; 3147 struct sysctl_oid_list *children; 3148 uint32_t val; 3149 3150 read_pdu_limits(sc, &nca->max_transmit_pdu, &nca->max_receive_pdu); 3151 if (nca->max_transmit_pdu > che_max_transmit_pdu) 3152 nca->max_transmit_pdu = che_max_transmit_pdu; 3153 if (nca->max_receive_pdu > che_max_receive_pdu) 3154 nca->max_receive_pdu = che_max_receive_pdu; 3155 val = t4_read_reg(sc, A_SGE_CONTROL2); 3156 nca->nvmt_data_iqe = (val & F_RXCPLMODE_NVMT) != 0; 3157 3158 sysctl_ctx_init(&nca->ctx); 3159 oid = device_get_sysctl_tree(sc->dev); /* dev.che.X */ 3160 children = SYSCTL_CHILDREN(oid); 3161 3162 oid = SYSCTL_ADD_NODE(&nca->ctx, children, OID_AUTO, "nvme", 3163 CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "NVMe ULP settings"); 3164 children = SYSCTL_CHILDREN(oid); 3165 3166 nca->ddp_threshold = 8192; 3167 SYSCTL_ADD_UINT(&nca->ctx, children, OID_AUTO, "ddp_threshold", 3168 CTLFLAG_RW, &nca->ddp_threshold, 0, "Rx zero copy threshold"); 3169 3170 SYSCTL_ADD_UINT(&nca->ctx, children, OID_AUTO, "max_transmit_pdu", 3171 CTLFLAG_RW, &nca->max_transmit_pdu, 0, 3172 "Maximum size of a transmitted PDU"); 3173 3174 SYSCTL_ADD_UINT(&nca->ctx, children, OID_AUTO, "max_receive_pdu", 3175 CTLFLAG_RW, &nca->max_receive_pdu, 0, 3176 "Maximum size of a received PDU"); 3177 3178 return (0); 3179 } 3180 3181 static void 3182 nvmf_che_destroy(struct nvmf_che_adapter *nca) 3183 { 3184 sysctl_ctx_free(&nca->ctx); 3185 free(nca, M_CXGBE); 3186 } 3187 3188 static int 3189 nvmf_che_activate(struct adapter *sc) 3190 { 3191 struct nvmf_che_adapter *nca; 3192 int rc; 3193 3194 ASSERT_SYNCHRONIZED_OP(sc); 3195 3196 if (uld_active(sc, ULD_NVME)) { 3197 KASSERT(0, ("%s: NVMe offload already enabled on adapter %p", 3198 __func__, sc)); 3199 return (0); 3200 } 3201 3202 if ((sc->nvmecaps & FW_CAPS_CONFIG_NVME_TCP) == 0) { 3203 device_printf(sc->dev, 3204 "not NVMe offload capable, or capability disabled\n"); 3205 return (ENOSYS); 3206 } 3207 3208 /* per-adapter softc for NVMe */ 3209 nca = malloc(sizeof(*nca), M_CXGBE, M_ZERO | M_WAITOK); 3210 nca->sc = sc; 3211 3212 rc = nvmf_che_init(sc, nca); 3213 if (rc != 0) { 3214 free(nca, M_CXGBE); 3215 return (rc); 3216 } 3217 3218 sc->nvme_ulp_softc = nca; 3219 3220 return (0); 3221 } 3222 3223 static int 3224 nvmf_che_deactivate(struct adapter *sc) 3225 { 3226 struct nvmf_che_adapter *nca = sc->nvme_ulp_softc; 3227 3228 ASSERT_SYNCHRONIZED_OP(sc); 3229 3230 if (nca != NULL) { 3231 nvmf_che_destroy(nca); 3232 sc->nvme_ulp_softc = NULL; 3233 } 3234 3235 return (0); 3236 } 3237 3238 static void 3239 nvmf_che_activate_all(struct adapter *sc, void *arg __unused) 3240 { 3241 if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t7nvact") != 0) 3242 return; 3243 3244 /* Activate NVMe if any port on this adapter has IFCAP_TOE enabled. */ 3245 if (sc->offload_map && !uld_active(sc, ULD_NVME)) 3246 (void) t4_activate_uld(sc, ULD_NVME); 3247 3248 end_synchronized_op(sc, 0); 3249 } 3250 3251 static void 3252 nvmf_che_deactivate_all(struct adapter *sc, void *arg __unused) 3253 { 3254 if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t7nvdea") != 0) 3255 return; 3256 3257 if (uld_active(sc, ULD_NVME)) 3258 (void) t4_deactivate_uld(sc, ULD_NVME); 3259 3260 end_synchronized_op(sc, 0); 3261 } 3262 3263 static struct uld_info nvmf_che_uld_info = { 3264 .uld_activate = nvmf_che_activate, 3265 .uld_deactivate = nvmf_che_deactivate, 3266 }; 3267 3268 static int 3269 nvmf_che_mod_load(void) 3270 { 3271 int rc; 3272 3273 t4_register_cpl_handler(CPL_NVMT_CMP, do_nvmt_cmp); 3274 t4_register_cpl_handler(CPL_NVMT_DATA, do_nvmt_data); 3275 3276 rc = t4_register_uld(&nvmf_che_uld_info, ULD_NVME); 3277 if (rc != 0) 3278 return (rc); 3279 3280 t4_iterate(nvmf_che_activate_all, NULL); 3281 3282 return (rc); 3283 } 3284 3285 static int 3286 nvmf_che_mod_unload(void) 3287 { 3288 t4_iterate(nvmf_che_deactivate_all, NULL); 3289 3290 if (t4_unregister_uld(&nvmf_che_uld_info, ULD_NVME) == EBUSY) 3291 return (EBUSY); 3292 3293 t4_register_cpl_handler(CPL_NVMT_CMP, NULL); 3294 t4_register_cpl_handler(CPL_NVMT_DATA, NULL); 3295 3296 return (0); 3297 } 3298 #endif 3299 3300 static int 3301 nvmf_che_modevent(module_t mod, int cmd, void *arg) 3302 { 3303 int rc; 3304 3305 #ifdef TCP_OFFLOAD 3306 switch (cmd) { 3307 case MOD_LOAD: 3308 rc = nvmf_che_mod_load(); 3309 break; 3310 case MOD_UNLOAD: 3311 rc = nvmf_che_mod_unload(); 3312 break; 3313 default: 3314 rc = EOPNOTSUPP; 3315 break; 3316 } 3317 #else 3318 printf("nvmf_che: compiled without TCP_OFFLOAD support.\n"); 3319 rc = EOPNOTSUPP; 3320 #endif 3321 3322 return (rc); 3323 } 3324 3325 static moduledata_t nvmf_che_mod = { 3326 "nvmf_che", 3327 nvmf_che_modevent, 3328 NULL, 3329 }; 3330 3331 MODULE_VERSION(nvmf_che, 1); 3332 DECLARE_MODULE(nvmf_che, nvmf_che_mod, SI_SUB_EXEC, SI_ORDER_ANY); 3333 MODULE_DEPEND(nvmf_che, t4_tom, 1, 1, 1); 3334 MODULE_DEPEND(nvmf_che, cxgbe, 1, 1, 1); 3335