/*-
 * Copyright (c) 2012 Chelsio Communications, Inc.
 * All rights reserved.
 *
 * Chelsio T5xx iSCSI driver
 *
 * Written by: Sreenivasa Honnur <shonnur@chelsio.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"

#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/module.h>
#include <sys/systm.h>

#ifdef TCP_OFFLOAD
#include <sys/errno.h>
#include <sys/kthread.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/mbuf.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/condvar.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/toecore.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>

#include <cam/scsi/scsi_all.h>
#include <cam/scsi/scsi_da.h>
#include <cam/ctl/ctl_io.h>
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_backend.h>
#include <cam/ctl/ctl_error.h>
#include <cam/ctl/ctl_frontend.h>
#include <cam/ctl/ctl_debug.h>
#include <cam/ctl/ctl_ha.h>
#include <cam/ctl/ctl_ioctl.h>

#include <dev/iscsi/icl.h>
#include <dev/iscsi/iscsi_proto.h>
#include <dev/iscsi/iscsi_ioctl.h>
#include <dev/iscsi/iscsi.h>
#include <cam/ctl/ctl_frontend_iscsi.h>

#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_xpt.h>
#include <cam/cam_debug.h>
#include <cam/cam_sim.h>
#include <cam/cam_xpt_sim.h>
#include <cam/cam_xpt_periph.h>
#include <cam/cam_periph.h>
#include <cam/cam_compat.h>
#include <cam/scsi/scsi_message.h>

#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"	/* for PCIE_MEM_ACCESS */
#include "tom/t4_tom.h"
#include "cxgbei.h"

static int worker_thread_count;
static struct cxgbei_worker_thread_softc *cwt_softc;
static struct proc *cxgbei_proc;

static void
read_pdu_limits(struct adapter *sc, uint32_t *max_tx_data_len,
    uint32_t *max_rx_data_len, struct ppod_region *pr)
{
	uint32_t tx_len, rx_len, r, v;
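	/*
	 * The maximum data segment length on each path is the smallest of
	 * the PMM page size, TP_PARA_REG2's MAXRXDATA, and TP_PARA_REG7's
	 * PMMAXXFERLEN limits, less the fixed per-PDU overhead (BHS and
	 * optional digests) subtracted below.
	 */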
	rx_len = t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE);
	tx_len = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE);

	r = t4_read_reg(sc, A_TP_PARA_REG2);
	rx_len = min(rx_len, G_MAXRXDATA(r));
	tx_len = min(tx_len, G_MAXRXDATA(r));

	r = t4_read_reg(sc, A_TP_PARA_REG7);
	v = min(G_PMMAXXFERLEN0(r), G_PMMAXXFERLEN1(r));
	rx_len = min(rx_len, v);
	tx_len = min(tx_len, v);

	/*
	 * AHS is not supported by the kernel so we'll not account for
	 * it either in our PDU len -> data segment len conversions.
	 */
	rx_len -= ISCSI_BHS_SIZE + ISCSI_HEADER_DIGEST_SIZE +
	    ISCSI_DATA_DIGEST_SIZE;
	tx_len -= ISCSI_BHS_SIZE + ISCSI_HEADER_DIGEST_SIZE +
	    ISCSI_DATA_DIGEST_SIZE;

	/*
	 * DDP can place only 4 pages for a single PDU.  A single
	 * request might use larger pages than the smallest page size,
	 * but that cannot be guaranteed.  Assume the smallest DDP
	 * page size for this limit.
	 */
	rx_len = min(rx_len, 4 * (1U << pr->pr_page_shift[0]));

	if (chip_id(sc) == CHELSIO_T5) {
		tx_len = min(tx_len, 15360);

		rx_len = rounddown2(rx_len, 512);
		tx_len = rounddown2(tx_len, 512);
	}

	*max_tx_data_len = tx_len;
	*max_rx_data_len = rx_len;
}

/*
 * Initialize the software state of the iSCSI ULP driver.
 *
 * ENXIO means firmware didn't set up something that it was supposed to.
 */
static int
cxgbei_init(struct adapter *sc, struct cxgbei_data *ci)
{
	struct sysctl_oid *oid;
	struct sysctl_oid_list *children;
	struct ppod_region *pr;
	uint32_t r;
	int rc;

	MPASS(sc->vres.iscsi.size > 0);
	MPASS(ci != NULL);

	pr = &ci->pr;
	r = t4_read_reg(sc, A_ULP_RX_ISCSI_PSZ);
	rc = t4_init_ppod_region(pr, &sc->vres.iscsi, r, "iSCSI page pods");
	if (rc != 0) {
		device_printf(sc->dev,
		    "%s: failed to initialize the iSCSI page pod region: %u.\n",
		    __func__, rc);
		return (rc);
	}

	r = t4_read_reg(sc, A_ULP_RX_ISCSI_TAGMASK);
	r &= V_ISCSITAGMASK(M_ISCSITAGMASK);
	if (r != pr->pr_tag_mask) {
		/*
		 * Recent firmwares are supposed to set up the iSCSI tagmask
		 * but we'll do it ourselves if the computed value doesn't
		 * match what's in the register.
		 */
		device_printf(sc->dev,
		    "tagmask 0x%08x does not match computed mask 0x%08x.\n", r,
		    pr->pr_tag_mask);
		t4_set_reg_field(sc, A_ULP_RX_ISCSI_TAGMASK,
		    V_ISCSITAGMASK(M_ISCSITAGMASK), pr->pr_tag_mask);
	}

	read_pdu_limits(sc, &ci->max_tx_data_len, &ci->max_rx_data_len, pr);

	sysctl_ctx_init(&ci->ctx);
	oid = device_get_sysctl_tree(sc->dev);	/* dev.t5nex.X */
	children = SYSCTL_CHILDREN(oid);

	oid = SYSCTL_ADD_NODE(&ci->ctx, children, OID_AUTO, "iscsi",
	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "iSCSI ULP settings");
	children = SYSCTL_CHILDREN(oid);

	ci->ddp_threshold = 2048;
	SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "ddp_threshold",
	    CTLFLAG_RW, &ci->ddp_threshold, 0, "Rx zero copy threshold");

	SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "max_rx_data_len",
	    CTLFLAG_RD, &ci->max_rx_data_len, 0,
	    "Maximum receive data segment length");
	SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "max_tx_data_len",
	    CTLFLAG_RD, &ci->max_tx_data_len, 0,
	    "Maximum transmit data segment length");

	return (0);
}

static int
do_rx_iscsi_hdr(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct cpl_iscsi_hdr *cpl = mtod(m, struct cpl_iscsi_hdr *);
	u_int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct icl_pdu *ip;
	struct icl_cxgbei_pdu *icp;
	uint16_t len_ddp = be16toh(cpl->pdu_len_ddp);
	uint16_t len = be16toh(cpl->len);

	M_ASSERTPKTHDR(m);
	MPASS(m->m_pkthdr.len == len + sizeof(*cpl));

	ip = icl_cxgbei_new_pdu(M_NOWAIT);
	if (ip == NULL)
		CXGBE_UNIMPLEMENTED("PDU allocation failure");
	m_copydata(m, sizeof(*cpl), ISCSI_BHS_SIZE, (caddr_t)ip->ip_bhs);
	ip->ip_data_len = G_ISCSI_PDU_LEN(len_ddp) - len;
	icp = ip_to_icp(ip);
	icp->icp_seq = ntohl(cpl->seq);
	icp->icp_flags = ICPF_RX_HDR;

	/* This is the start of a new PDU.  There should be no old state. */
	MPASS(toep->ulpcb2 == NULL);
	toep->ulpcb2 = icp;

#if 0
	CTR5(KTR_CXGBE, "%s: tid %u, cpl->len %u, pdu_len_ddp 0x%04x, icp %p",
	    __func__, tid, len, len_ddp, icp);
#endif

	m_freem(m);
	return (0);
}

static int
do_rx_iscsi_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct cpl_iscsi_data *cpl = mtod(m, struct cpl_iscsi_data *);
	u_int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
	struct icl_pdu *ip;

	M_ASSERTPKTHDR(m);
	MPASS(m->m_pkthdr.len == be16toh(cpl->len) + sizeof(*cpl));

	if (icp == NULL) {
		/*
		 * T6 completion enabled, start of a new pdu.  Header
		 * will come in completion CPL.
		 */
		ip = icl_cxgbei_new_pdu(M_NOWAIT);
		if (ip == NULL)
			CXGBE_UNIMPLEMENTED("PDU allocation failure");
		icp = ip_to_icp(ip);
	} else {
		/* T5 mode, header is already received. */
		MPASS(icp->icp_flags == ICPF_RX_HDR);
		MPASS(icp->ip.ip_data_mbuf == NULL);
		MPASS(icp->ip.ip_data_len == m->m_pkthdr.len - sizeof(*cpl));
	}

	/* Trim the cpl header from mbuf. */
	m_adj(m, sizeof(*cpl));

	icp->icp_flags |= ICPF_RX_FLBUF;
	icp->ip.ip_data_mbuf = m;
	toep->ofld_rxq->rx_iscsi_fl_pdus++;
	toep->ofld_rxq->rx_iscsi_fl_octets += m->m_pkthdr.len;

	/*
	 * For T6, save the icp for further processing in the
	 * completion handler.
	 */
	if (icp->icp_flags == ICPF_RX_FLBUF) {
		MPASS(toep->ulpcb2 == NULL);
		toep->ulpcb2 = icp;
	}

#if 0
	CTR4(KTR_CXGBE, "%s: tid %u, cpl->len %u, icp %p", __func__, tid,
	    be16toh(cpl->len), icp);
#endif

	return (0);
}

static int
do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
	u_int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
	struct socket *so;
	struct sockbuf *sb;
	struct tcpcb *tp;
	struct icl_cxgbei_conn *icc;
	struct icl_conn *ic;
	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
	struct icl_pdu *ip;
	u_int pdu_len, val;
	struct epoch_tracker et;

	MPASS(m == NULL);

	/* Must already be assembling a PDU. */
	MPASS(icp != NULL);
	MPASS(icp->icp_flags & ICPF_RX_HDR);	/* Data is optional. */
	MPASS((icp->icp_flags & ICPF_RX_STATUS) == 0);

	pdu_len = be16toh(cpl->len);	/* includes everything. */
	val = be32toh(cpl->ddpvld);

#if 0
	CTR5(KTR_CXGBE,
	    "%s: tid %u, cpl->len %u, ddpvld 0x%08x, icp_flags 0x%08x",
	    __func__, tid, pdu_len, val, icp->icp_flags);
#endif

	icp->icp_flags |= ICPF_RX_STATUS;
	ip = &icp->ip;
	if (val & F_DDP_PADDING_ERR)
		icp->icp_flags |= ICPF_PAD_ERR;
	if (val & F_DDP_HDRCRC_ERR)
		icp->icp_flags |= ICPF_HCRC_ERR;
	if (val & F_DDP_DATACRC_ERR)
		icp->icp_flags |= ICPF_DCRC_ERR;
	if (val & F_DDP_PDU && ip->ip_data_mbuf == NULL) {
		MPASS((icp->icp_flags & ICPF_RX_FLBUF) == 0);
		MPASS(ip->ip_data_len > 0);
		icp->icp_flags |= ICPF_RX_DDP;
		toep->ofld_rxq->rx_iscsi_ddp_pdus++;
		toep->ofld_rxq->rx_iscsi_ddp_octets += ip->ip_data_len;
	}

	INP_WLOCK(inp);
	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
		    __func__, tid, pdu_len, inp->inp_flags);
		INP_WUNLOCK(inp);
		icl_cxgbei_conn_pdu_free(NULL, ip);
		toep->ulpcb2 = NULL;
		return (0);
	}

	/*
	 * T6+ does not report data PDUs received via DDP without F
	 * set.  This can result in gaps in the TCP sequence space.
	 */
	tp = intotcpcb(inp);
	MPASS(chip_id(sc) >= CHELSIO_T6 || icp->icp_seq == tp->rcv_nxt);
	tp->rcv_nxt = icp->icp_seq + pdu_len;
	tp->t_rcvtime = ticks;

	/*
	 * Don't update the window size or return credits since RX
	 * flow control is disabled.
	 */

	so = inp->inp_socket;
	sb = &so->so_rcv;
	SOCKBUF_LOCK(sb);

	icc = toep->ulpcb;
	if (__predict_false(icc == NULL || sb->sb_state & SBS_CANTRCVMORE)) {
		CTR5(KTR_CXGBE,
		    "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x",
		    __func__, tid, pdu_len, icc, sb->sb_state);
		SOCKBUF_UNLOCK(sb);
		INP_WUNLOCK(inp);

		CURVNET_SET(so->so_vnet);
		NET_EPOCH_ENTER(et);
		INP_WLOCK(inp);
		tp = tcp_drop(tp, ECONNRESET);
		if (tp)
			INP_WUNLOCK(inp);
		NET_EPOCH_EXIT(et);
		CURVNET_RESTORE();

		icl_cxgbei_conn_pdu_free(NULL, ip);
		toep->ulpcb2 = NULL;
		return (0);
	}
	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
	ic = &icc->ic;
	icl_cxgbei_new_pdu_set_conn(ip, ic);

	MPASS(m == NULL);	/* was unused, we'll use it now. */
	m = sbcut_locked(sb, sbused(sb));	/* XXXNP: toep->sb_cc accounting? */
	if (__predict_false(m != NULL)) {
		int len = m_length(m, NULL);

		/*
		 * PDUs were received before the tid transitioned to ULP mode.
		 * Convert them to icl_cxgbei_pdus and send them to ICL before
		 * the PDU in icp/ip.
		 */
		CTR3(KTR_CXGBE, "%s: tid %u, %u bytes in so_rcv", __func__, tid,
		    len);

		/* XXXNP: needs to be rewritten. */
		if (len == sizeof(struct iscsi_bhs) ||
		    len == 4 + sizeof(struct iscsi_bhs)) {
			struct icl_cxgbei_pdu *icp0;
			struct icl_pdu *ip0;

			ip0 = icl_cxgbei_new_pdu(M_NOWAIT);
			if (ip0 == NULL)
				CXGBE_UNIMPLEMENTED("PDU allocation failure");
			icl_cxgbei_new_pdu_set_conn(ip0, ic);
			icp0 = ip_to_icp(ip0);
			icp0->icp_seq = 0;	/* XXX */
			icp0->icp_flags = ICPF_RX_HDR | ICPF_RX_STATUS;
			m_copydata(m, 0, sizeof(struct iscsi_bhs),
			    (void *)ip0->ip_bhs);
			STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip0, ip_next);
		}
		m_freem(m);
	}

	STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);
	if ((icc->rx_flags & RXF_ACTIVE) == 0) {
		struct cxgbei_worker_thread_softc *cwt = &cwt_softc[icc->cwt];

		mtx_lock(&cwt->cwt_lock);
		icc->rx_flags |= RXF_ACTIVE;
		TAILQ_INSERT_TAIL(&cwt->rx_head, icc, rx_link);
		if (cwt->cwt_state == CWT_SLEEPING) {
			cwt->cwt_state = CWT_RUNNING;
			cv_signal(&cwt->cwt_cv);
		}
		mtx_unlock(&cwt->cwt_lock);
	}
	SOCKBUF_UNLOCK(sb);
	INP_WUNLOCK(inp);

	toep->ulpcb2 = NULL;

	return (0);
}

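/*
 * CPL_RX_ISCSI_CMP reports the header and final status for a PDU (or for
 * a burst of DDP'd Data-In/Data-Out PDUs) in a single message when iSCSI
 * completion is enabled (T6 and later).
 */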
static int
do_rx_iscsi_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct epoch_tracker et;
	struct adapter *sc = iq->adapter;
	struct cpl_rx_iscsi_cmp *cpl = mtod(m, struct cpl_rx_iscsi_cmp *);
	u_int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
	struct icl_pdu *ip;
	struct cxgbei_cmp *cmp;
	struct inpcb *inp = toep->inp;
#ifdef INVARIANTS
	uint16_t len = be16toh(cpl->len);
#endif
	struct socket *so;
	struct sockbuf *sb;
	struct tcpcb *tp;
	struct icl_cxgbei_conn *icc;
	struct icl_conn *ic;
	struct iscsi_bhs_data_out *bhsdo;
	u_int val = be32toh(cpl->ddpvld);
	u_int npdus, pdu_len, data_digest_len, hdr_digest_len;
	uint32_t prev_seg_len;

	M_ASSERTPKTHDR(m);
	MPASS(m->m_pkthdr.len == len + sizeof(*cpl));

	if ((val & F_DDP_PDU) == 0) {
		MPASS(icp != NULL);
		MPASS((icp->icp_flags & ICPF_RX_STATUS) == 0);
		ip = &icp->ip;
	}

	if (icp == NULL) {
		/* T6 completion enabled, start of a new PDU. */
		ip = icl_cxgbei_new_pdu(M_NOWAIT);
		if (ip == NULL)
			CXGBE_UNIMPLEMENTED("PDU allocation failure");
		icp = ip_to_icp(ip);
	}
	pdu_len = G_ISCSI_PDU_LEN(be16toh(cpl->pdu_len_ddp));

#if 0
	CTR5(KTR_CXGBE,
	    "%s: tid %u, cpl->len %u, ddpvld 0x%08x, icp %p",
	    __func__, tid, pdu_len, val, icp);
#endif

	/* Copy header */
	m_copydata(m, sizeof(*cpl), ISCSI_BHS_SIZE, (caddr_t)ip->ip_bhs);
	bhsdo = (struct iscsi_bhs_data_out *)ip->ip_bhs;
	ip->ip_data_len = bhsdo->bhsdo_data_segment_len[0] << 16 |
	    bhsdo->bhsdo_data_segment_len[1] << 8 |
	    bhsdo->bhsdo_data_segment_len[2];
	icp->icp_seq = ntohl(cpl->seq);
	icp->icp_flags |= ICPF_RX_HDR;
	icp->icp_flags |= ICPF_RX_STATUS;

	if (val & F_DDP_PADDING_ERR)
		icp->icp_flags |= ICPF_PAD_ERR;
	if (val & F_DDP_HDRCRC_ERR)
		icp->icp_flags |= ICPF_HCRC_ERR;
	if (val & F_DDP_DATACRC_ERR)
		icp->icp_flags |= ICPF_DCRC_ERR;

	INP_WLOCK(inp);
	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
		    __func__, tid, pdu_len, inp->inp_flags);
		INP_WUNLOCK(inp);
		icl_cxgbei_conn_pdu_free(NULL, ip);
		toep->ulpcb2 = NULL;
		m_freem(m);
		return (0);
	}

	tp = intotcpcb(inp);

	/*
	 * If icc is NULL, the connection is being closed in
	 * icl_cxgbei_conn_close(), just drop this data.
	 */
	icc = toep->ulpcb;
	if (__predict_false(icc == NULL)) {
		CTR4(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes), icc %p",
		    __func__, tid, pdu_len, icc);

		/*
		 * Update rcv_nxt so the sequence number of the FIN
		 * doesn't appear wrong.
		 */
		tp->rcv_nxt = icp->icp_seq + pdu_len;
		tp->t_rcvtime = ticks;
		INP_WUNLOCK(inp);

		icl_cxgbei_conn_pdu_free(NULL, ip);
		toep->ulpcb2 = NULL;
		m_freem(m);
		return (0);
	}

	data_digest_len = (icc->ulp_submode & ULP_CRC_DATA) ?
	    ISCSI_DATA_DIGEST_SIZE : 0;
	hdr_digest_len = (icc->ulp_submode & ULP_CRC_HEADER) ?
	    ISCSI_HEADER_DIGEST_SIZE : 0;
	MPASS(roundup2(ip->ip_data_len, 4) == pdu_len - len - data_digest_len);

	if (val & F_DDP_PDU && ip->ip_data_mbuf == NULL) {
		MPASS((icp->icp_flags & ICPF_RX_FLBUF) == 0);
		MPASS(ip->ip_data_len > 0);
		icp->icp_flags |= ICPF_RX_DDP;
		bhsdo = (struct iscsi_bhs_data_out *)ip->ip_bhs;

		switch (ip->ip_bhs->bhs_opcode & ~ISCSI_BHS_OPCODE_IMMEDIATE) {
		case ISCSI_BHS_OPCODE_SCSI_DATA_IN:
			cmp = cxgbei_find_cmp(icc,
			    be32toh(bhsdo->bhsdo_initiator_task_tag));
			break;
		case ISCSI_BHS_OPCODE_SCSI_DATA_OUT:
			cmp = cxgbei_find_cmp(icc,
			    be32toh(bhsdo->bhsdo_target_transfer_tag));
			break;
		default:
			__assert_unreachable();
		}
		MPASS(cmp != NULL);

		/*
		 * The difference between the end of the last burst
		 * and the offset of the last PDU in this burst is
		 * the additional data received via DDP.
		 */
		prev_seg_len = be32toh(bhsdo->bhsdo_buffer_offset) -
		    cmp->next_buffer_offset;

		if (prev_seg_len != 0) {
			uint32_t orig_datasn;

			/*
			 * Return a "large" PDU representing the burst
			 * of PDUs.  Adjust the offset and length of
			 * this PDU to represent the entire burst.
			 */
			ip->ip_data_len += prev_seg_len;
			bhsdo->bhsdo_data_segment_len[2] = ip->ip_data_len;
			bhsdo->bhsdo_data_segment_len[1] = ip->ip_data_len >> 8;
			bhsdo->bhsdo_data_segment_len[0] = ip->ip_data_len >> 16;
			bhsdo->bhsdo_buffer_offset =
			    htobe32(cmp->next_buffer_offset);

			orig_datasn = htobe32(bhsdo->bhsdo_datasn);
			npdus = orig_datasn - cmp->last_datasn;
			bhsdo->bhsdo_datasn = htobe32(cmp->last_datasn + 1);
			cmp->last_datasn = orig_datasn;
			ip->ip_additional_pdus = npdus - 1;
		} else {
			MPASS(htobe32(bhsdo->bhsdo_datasn) ==
			    cmp->last_datasn + 1);
			npdus = 1;
			cmp->last_datasn = htobe32(bhsdo->bhsdo_datasn);
		}

		cmp->next_buffer_offset += ip->ip_data_len;
		toep->ofld_rxq->rx_iscsi_ddp_pdus += npdus;
		toep->ofld_rxq->rx_iscsi_ddp_octets += ip->ip_data_len;
	} else {
		MPASS(icp->icp_flags & (ICPF_RX_FLBUF));
		MPASS(ip->ip_data_len == ip->ip_data_mbuf->m_pkthdr.len);
	}

	tp->rcv_nxt = icp->icp_seq + pdu_len;
	tp->t_rcvtime = ticks;

	/*
	 * Don't update the window size or return credits since RX
	 * flow control is disabled.
	 */

	so = inp->inp_socket;
	sb = &so->so_rcv;
	SOCKBUF_LOCK(sb);
	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
		CTR5(KTR_CXGBE,
		    "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x",
		    __func__, tid, pdu_len, icc, sb->sb_state);
		SOCKBUF_UNLOCK(sb);
		INP_WUNLOCK(inp);

		CURVNET_SET(so->so_vnet);
		NET_EPOCH_ENTER(et);
		INP_WLOCK(inp);
		tp = tcp_drop(tp, ECONNRESET);
		if (tp != NULL)
			INP_WUNLOCK(inp);
		NET_EPOCH_EXIT(et);
		CURVNET_RESTORE();

		icl_cxgbei_conn_pdu_free(NULL, ip);
		toep->ulpcb2 = NULL;
		m_freem(m);
		return (0);
	}
	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
	ic = &icc->ic;
	icl_cxgbei_new_pdu_set_conn(ip, ic);

	/* Enqueue the PDU to the received pdus queue. */
	STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);
	if ((icc->rx_flags & RXF_ACTIVE) == 0) {
		struct cxgbei_worker_thread_softc *cwt = &cwt_softc[icc->cwt];

		mtx_lock(&cwt->cwt_lock);
		icc->rx_flags |= RXF_ACTIVE;
		TAILQ_INSERT_TAIL(&cwt->rx_head, icc, rx_link);
		if (cwt->cwt_state == CWT_SLEEPING) {
			cwt->cwt_state = CWT_RUNNING;
			cv_signal(&cwt->cwt_cv);
		}
		mtx_unlock(&cwt->cwt_lock);
	}
	SOCKBUF_UNLOCK(sb);
	INP_WUNLOCK(inp);

	toep->ulpcb2 = NULL;
	m_freem(m);

	return (0);
}

static int
cxgbei_activate(struct adapter *sc)
{
	struct cxgbei_data *ci;
	int rc;

	ASSERT_SYNCHRONIZED_OP(sc);

	if (uld_active(sc, ULD_ISCSI)) {
		KASSERT(0, ("%s: iSCSI offload already enabled on adapter %p",
		    __func__, sc));
		return (0);
	}

	if (sc->iscsicaps == 0 || sc->vres.iscsi.size == 0) {
		device_printf(sc->dev,
		    "not iSCSI offload capable, or capability disabled.\n");
		return (ENOSYS);
	}

	/* per-adapter softc for iSCSI */
	ci = malloc(sizeof(*ci), M_CXGBE, M_ZERO | M_WAITOK);
	if (ci == NULL)
		return (ENOMEM);

	rc = cxgbei_init(sc, ci);
	if (rc != 0) {
		free(ci, M_CXGBE);
		return (rc);
	}

	sc->iscsi_ulp_softc = ci;

	return (0);
}

static int
cxgbei_deactivate(struct adapter *sc)
{
	struct cxgbei_data *ci = sc->iscsi_ulp_softc;

	ASSERT_SYNCHRONIZED_OP(sc);

	if (ci != NULL) {
		sysctl_ctx_free(&ci->ctx);
		t4_free_ppod_region(&ci->pr);
		free(ci, M_CXGBE);
		sc->iscsi_ulp_softc = NULL;
	}

	return (0);
}

static void
cxgbei_activate_all(struct adapter *sc, void *arg __unused)
{

	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isact") != 0)
		return;

	/* Activate iSCSI if any port on this adapter has IFCAP_TOE enabled. */
	if (sc->offload_map && !uld_active(sc, ULD_ISCSI))
		(void) t4_activate_uld(sc, ULD_ISCSI);

	end_synchronized_op(sc, 0);
}

static void
cxgbei_deactivate_all(struct adapter *sc, void *arg __unused)
{

	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isdea") != 0)
		return;

	if (uld_active(sc, ULD_ISCSI))
		(void) t4_deactivate_uld(sc, ULD_ISCSI);

	end_synchronized_op(sc, 0);
}

static struct uld_info cxgbei_uld_info = {
	.uld_id = ULD_ISCSI,
	.activate = cxgbei_activate,
	.deactivate = cxgbei_deactivate,
};

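/*
 * Each offloaded connection is assigned to one worker thread (see
 * cxgbei_select_worker_thread).  The CPL handlers above only queue
 * received PDUs on icc->rcvd_pdus and wake the thread; cwt_main()
 * hands them to ICL via ic->ic_receive outside the CPL handler path.
 */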
static void
cwt_main(void *arg)
{
	struct cxgbei_worker_thread_softc *cwt = arg;
	struct icl_cxgbei_conn *icc = NULL;
	struct icl_conn *ic;
	struct icl_pdu *ip;
	struct sockbuf *sb;
	STAILQ_HEAD(, icl_pdu) rx_pdus = STAILQ_HEAD_INITIALIZER(rx_pdus);

	MPASS(cwt != NULL);

	mtx_lock(&cwt->cwt_lock);
	MPASS(cwt->cwt_state == 0);
	cwt->cwt_state = CWT_RUNNING;
	cv_signal(&cwt->cwt_cv);

	while (__predict_true(cwt->cwt_state != CWT_STOP)) {
		cwt->cwt_state = CWT_RUNNING;
		while ((icc = TAILQ_FIRST(&cwt->rx_head)) != NULL) {
			TAILQ_REMOVE(&cwt->rx_head, icc, rx_link);
			mtx_unlock(&cwt->cwt_lock);

			ic = &icc->ic;
			sb = &ic->ic_socket->so_rcv;

			SOCKBUF_LOCK(sb);
			MPASS(icc->rx_flags & RXF_ACTIVE);
			if (__predict_true(!(sb->sb_state & SBS_CANTRCVMORE))) {
				MPASS(STAILQ_EMPTY(&rx_pdus));
				STAILQ_SWAP(&icc->rcvd_pdus, &rx_pdus, icl_pdu);
				SOCKBUF_UNLOCK(sb);

				/* Hand over PDUs to ICL. */
				while ((ip = STAILQ_FIRST(&rx_pdus)) != NULL) {
					STAILQ_REMOVE_HEAD(&rx_pdus, ip_next);
					ic->ic_receive(ip);
				}

				SOCKBUF_LOCK(sb);
				MPASS(STAILQ_EMPTY(&rx_pdus));
			}
			MPASS(icc->rx_flags & RXF_ACTIVE);
			if (STAILQ_EMPTY(&icc->rcvd_pdus) ||
			    __predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
				icc->rx_flags &= ~RXF_ACTIVE;
			} else {
				/*
				 * More PDUs were received while we were busy
				 * handing over the previous batch to ICL.
				 * Re-add this connection to the end of the
				 * queue.
				 */
				mtx_lock(&cwt->cwt_lock);
				TAILQ_INSERT_TAIL(&cwt->rx_head, icc,
				    rx_link);
				mtx_unlock(&cwt->cwt_lock);
			}
			SOCKBUF_UNLOCK(sb);

			mtx_lock(&cwt->cwt_lock);
		}

		/* Inner loop doesn't check for CWT_STOP, do that first. */
		if (__predict_false(cwt->cwt_state == CWT_STOP))
			break;
		cwt->cwt_state = CWT_SLEEPING;
		cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
	}

	MPASS(TAILQ_FIRST(&cwt->rx_head) == NULL);
	mtx_assert(&cwt->cwt_lock, MA_OWNED);
	cwt->cwt_state = CWT_STOPPED;
	cv_signal(&cwt->cwt_cv);
	mtx_unlock(&cwt->cwt_lock);
	kthread_exit();
}

static int
start_worker_threads(void)
{
	int i, rc;
	struct cxgbei_worker_thread_softc *cwt;

	worker_thread_count = min(mp_ncpus, 32);
	cwt_softc = malloc(worker_thread_count * sizeof(*cwt), M_CXGBE,
	    M_WAITOK | M_ZERO);

	MPASS(cxgbei_proc == NULL);
	for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) {
		mtx_init(&cwt->cwt_lock, "cwt lock", NULL, MTX_DEF);
		cv_init(&cwt->cwt_cv, "cwt cv");
		TAILQ_INIT(&cwt->rx_head);
		rc = kproc_kthread_add(cwt_main, cwt, &cxgbei_proc, NULL, 0, 0,
		    "cxgbei", "%d", i);
		if (rc != 0) {
			printf("cxgbei: failed to start thread #%d/%d (%d)\n",
			    i + 1, worker_thread_count, rc);
			mtx_destroy(&cwt->cwt_lock);
			cv_destroy(&cwt->cwt_cv);
			bzero(cwt, sizeof(*cwt));
			if (i == 0) {
				free(cwt_softc, M_CXGBE);
				worker_thread_count = 0;

				return (rc);
			}

			/* Not fatal, carry on with fewer threads. */
			worker_thread_count = i;
			rc = 0;
			break;
		}

		/* Wait for thread to start before moving on to the next one. */
		mtx_lock(&cwt->cwt_lock);
		while (cwt->cwt_state == 0)
			cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
		mtx_unlock(&cwt->cwt_lock);
	}

	MPASS(cwt_softc != NULL);
	MPASS(worker_thread_count > 0);
	return (0);
}

static void
stop_worker_threads(void)
{
	int i;
	struct cxgbei_worker_thread_softc *cwt = &cwt_softc[0];

	MPASS(worker_thread_count >= 0);

	for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) {
		mtx_lock(&cwt->cwt_lock);
		MPASS(cwt->cwt_state == CWT_RUNNING ||
		    cwt->cwt_state == CWT_SLEEPING);
		cwt->cwt_state = CWT_STOP;
		cv_signal(&cwt->cwt_cv);
		do {
			cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
		} while (cwt->cwt_state != CWT_STOPPED);
		mtx_unlock(&cwt->cwt_lock);
		mtx_destroy(&cwt->cwt_lock);
		cv_destroy(&cwt->cwt_cv);
	}
	free(cwt_softc, M_CXGBE);
}

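/*
 * Select a worker thread for a connection.  If there are enough threads
 * to give each port its own block (worker_thread_count / nofldrxq per
 * block), pick a random thread from this port's block; otherwise pick
 * any thread at random.
 */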
u_int
cxgbei_select_worker_thread(struct icl_cxgbei_conn *icc)
{
	struct adapter *sc = icc->sc;
	struct toepcb *toep = icc->toep;
	u_int i, n;

	n = worker_thread_count / sc->sge.nofldrxq;
	if (n > 0)
		i = toep->vi->pi->port_id * n + arc4random() % n;
	else
		i = arc4random() % worker_thread_count;

	CTR3(KTR_CXGBE, "%s: tid %u, cwt %u", __func__, toep->tid, i);

	return (i);
}

static int
cxgbei_mod_load(void)
{
	int rc;

	t4_register_cpl_handler(CPL_ISCSI_HDR, do_rx_iscsi_hdr);
	t4_register_cpl_handler(CPL_ISCSI_DATA, do_rx_iscsi_data);
	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, do_rx_iscsi_ddp);
	t4_register_cpl_handler(CPL_RX_ISCSI_CMP, do_rx_iscsi_cmp);

	rc = start_worker_threads();
	if (rc != 0)
		return (rc);

	rc = t4_register_uld(&cxgbei_uld_info);
	if (rc != 0) {
		stop_worker_threads();
		return (rc);
	}

	t4_iterate(cxgbei_activate_all, NULL);

	return (rc);
}

static int
cxgbei_mod_unload(void)
{

	t4_iterate(cxgbei_deactivate_all, NULL);

	if (t4_unregister_uld(&cxgbei_uld_info) == EBUSY)
		return (EBUSY);

	stop_worker_threads();

	t4_register_cpl_handler(CPL_ISCSI_HDR, NULL);
	t4_register_cpl_handler(CPL_ISCSI_DATA, NULL);
	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, NULL);
	t4_register_cpl_handler(CPL_RX_ISCSI_CMP, NULL);

	return (0);
}
#endif

static int
cxgbei_modevent(module_t mod, int cmd, void *arg)
{
	int rc = 0;

#ifdef TCP_OFFLOAD
	switch (cmd) {
	case MOD_LOAD:
		rc = cxgbei_mod_load();
		if (rc == 0)
			rc = icl_cxgbei_mod_load();
		break;

	case MOD_UNLOAD:
		rc = icl_cxgbei_mod_unload();
		if (rc == 0)
			rc = cxgbei_mod_unload();
		break;

	default:
		rc = EINVAL;
	}
#else
	printf("cxgbei: compiled without TCP_OFFLOAD support.\n");
	rc = EOPNOTSUPP;
#endif

	return (rc);
}

static moduledata_t cxgbei_mod = {
	"cxgbei",
	cxgbei_modevent,
	NULL,
};

MODULE_VERSION(cxgbei, 1);
DECLARE_MODULE(cxgbei, cxgbei_mod, SI_SUB_EXEC, SI_ORDER_ANY);
MODULE_DEPEND(cxgbei, t4_tom, 1, 1, 1);
MODULE_DEPEND(cxgbei, cxgbe, 1, 1, 1);
MODULE_DEPEND(cxgbei, icl, 1, 1, 1);