/*-
 * Copyright (c) 2012 Chelsio Communications, Inc.
 * All rights reserved.
 *
 * Chelsio T5xx iSCSI driver
 *
 * Written by: Sreenivasa Honnur <shonnur@chelsio.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"

#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/module.h>
#include <sys/systm.h>

#ifdef TCP_OFFLOAD
#include <sys/errno.h>
#include <sys/kthread.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/mbuf.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/condvar.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/toecore.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>

#include <cam/scsi/scsi_all.h>
#include <cam/scsi/scsi_da.h>
#include <cam/ctl/ctl_io.h>
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_backend.h>
#include <cam/ctl/ctl_error.h>
#include <cam/ctl/ctl_frontend.h>
#include <cam/ctl/ctl_debug.h>
#include <cam/ctl/ctl_ha.h>
#include <cam/ctl/ctl_ioctl.h>

#include <dev/iscsi/icl.h>
#include <dev/iscsi/iscsi_proto.h>
#include <dev/iscsi/iscsi_ioctl.h>
#include <dev/iscsi/iscsi.h>
#include <cam/ctl/ctl_frontend_iscsi.h>

#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_xpt.h>
#include <cam/cam_debug.h>
#include <cam/cam_sim.h>
#include <cam/cam_xpt_sim.h>
#include <cam/cam_xpt_periph.h>
#include <cam/cam_periph.h>
#include <cam/cam_compat.h>
#include <cam/scsi/scsi_message.h>

#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"     /* for PCIE_MEM_ACCESS */
#include "tom/t4_tom.h"
#include "cxgbei.h"

static int worker_thread_count;
static struct cxgbei_worker_thread_softc *cwt_softc;
static struct proc *cxgbei_proc;
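
/*
 * Determine the maximum data segment length that the chip can send or
 * receive in a single iSCSI PDU.  The limits are derived from the PMM
 * page sizes and the TP_PARA registers, reduced by the fixed PDU
 * overhead (BHS and optional digests), and further capped on receive
 * by how much DDP can place for a single PDU.
 */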
static void
read_pdu_limits(struct adapter *sc, uint32_t *max_tx_data_len,
    uint32_t *max_rx_data_len, struct ppod_region *pr)
{
        uint32_t tx_len, rx_len, r, v;

        rx_len = t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE);
        tx_len = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE);

        r = t4_read_reg(sc, A_TP_PARA_REG2);
        rx_len = min(rx_len, G_MAXRXDATA(r));
        tx_len = min(tx_len, G_MAXRXDATA(r));

        r = t4_read_reg(sc, A_TP_PARA_REG7);
        v = min(G_PMMAXXFERLEN0(r), G_PMMAXXFERLEN1(r));
        rx_len = min(rx_len, v);
        tx_len = min(tx_len, v);

        /*
         * AHS is not supported by the kernel so we'll not account for
         * it either in our PDU len -> data segment len conversions.
         */
        rx_len -= ISCSI_BHS_SIZE + ISCSI_HEADER_DIGEST_SIZE +
            ISCSI_DATA_DIGEST_SIZE;
        tx_len -= ISCSI_BHS_SIZE + ISCSI_HEADER_DIGEST_SIZE +
            ISCSI_DATA_DIGEST_SIZE;

        /*
         * DDP can place only 4 pages for a single PDU.  A single
         * request might use larger pages than the smallest page size,
         * but that cannot be guaranteed.  Assume the smallest DDP
         * page size for this limit.
         */
        rx_len = min(rx_len, 4 * (1U << pr->pr_page_shift[0]));

        if (chip_id(sc) == CHELSIO_T5) {
                tx_len = min(tx_len, 15360);

                rx_len = rounddown2(rx_len, 512);
                tx_len = rounddown2(tx_len, 512);
        }

        *max_tx_data_len = tx_len;
        *max_rx_data_len = rx_len;
}
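
/*
 * Illustrative sketch (not compiled): the "PDU len -> data segment len"
 * conversion used above, written out as a stand-alone helper.  The
 * usable data segment is whatever remains after the 48-byte BHS and the
 * two optional 4-byte digests; AHS is ignored because the kernel does
 * not support it.  The helper name is hypothetical and not part of this
 * driver.
 */
#if 0
static uint32_t
pdu_len_to_data_segment_len(uint32_t pdu_len)
{

        /* BHS (48) + header digest (4) + data digest (4). */
        return (pdu_len - ISCSI_BHS_SIZE - ISCSI_HEADER_DIGEST_SIZE -
            ISCSI_DATA_DIGEST_SIZE);
}
#endif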

/*
 * Initialize the software state of the iSCSI ULP driver.
 *
 * ENXIO means firmware didn't set up something that it was supposed to.
 */
static int
cxgbei_init(struct adapter *sc, struct cxgbei_data *ci)
{
        struct sysctl_oid *oid;
        struct sysctl_oid_list *children;
        struct ppod_region *pr;
        uint32_t r;
        int rc;

        MPASS(sc->vres.iscsi.size > 0);
        MPASS(ci != NULL);

        pr = &ci->pr;
        r = t4_read_reg(sc, A_ULP_RX_ISCSI_PSZ);
        rc = t4_init_ppod_region(pr, &sc->vres.iscsi, r, "iSCSI page pods");
        if (rc != 0) {
                device_printf(sc->dev,
                    "%s: failed to initialize the iSCSI page pod region: %u.\n",
                    __func__, rc);
                return (rc);
        }

        r = t4_read_reg(sc, A_ULP_RX_ISCSI_TAGMASK);
        r &= V_ISCSITAGMASK(M_ISCSITAGMASK);
        if (r != pr->pr_tag_mask) {
                /*
                 * Recent firmwares are supposed to set up the iSCSI tagmask
                 * but we'll do it ourselves if the computed value doesn't
                 * match what's in the register.
                 */
                device_printf(sc->dev,
                    "tagmask 0x%08x does not match computed mask 0x%08x.\n", r,
                    pr->pr_tag_mask);
                t4_set_reg_field(sc, A_ULP_RX_ISCSI_TAGMASK,
                    V_ISCSITAGMASK(M_ISCSITAGMASK), pr->pr_tag_mask);
        }

        read_pdu_limits(sc, &ci->max_tx_data_len, &ci->max_rx_data_len, pr);

        sysctl_ctx_init(&ci->ctx);
        oid = device_get_sysctl_tree(sc->dev);  /* dev.t5nex.X */
        children = SYSCTL_CHILDREN(oid);

        oid = SYSCTL_ADD_NODE(&ci->ctx, children, OID_AUTO, "iscsi",
            CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "iSCSI ULP settings");
        children = SYSCTL_CHILDREN(oid);

        ci->ddp_threshold = 2048;
        SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "ddp_threshold",
            CTLFLAG_RW, &ci->ddp_threshold, 0, "Rx zero copy threshold");

        SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "max_rx_data_len",
            CTLFLAG_RD, &ci->max_rx_data_len, 0,
            "Maximum receive data segment length");
        SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "max_tx_data_len",
            CTLFLAG_RD, &ci->max_tx_data_len, 0,
            "Maximum transmit data segment length");

        return (0);
}

static int
do_rx_iscsi_hdr(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
        struct adapter *sc = iq->adapter;
        struct cpl_iscsi_hdr *cpl = mtod(m, struct cpl_iscsi_hdr *);
        u_int tid = GET_TID(cpl);
        struct toepcb *toep = lookup_tid(sc, tid);
        struct icl_pdu *ip;
        struct icl_cxgbei_pdu *icp;
        uint16_t len_ddp = be16toh(cpl->pdu_len_ddp);
        uint16_t len = be16toh(cpl->len);

        M_ASSERTPKTHDR(m);
        MPASS(m->m_pkthdr.len == len + sizeof(*cpl));

        ip = icl_cxgbei_new_pdu(M_NOWAIT);
        if (ip == NULL)
                CXGBE_UNIMPLEMENTED("PDU allocation failure");
        m_copydata(m, sizeof(*cpl), ISCSI_BHS_SIZE, (caddr_t)ip->ip_bhs);
        ip->ip_data_len = G_ISCSI_PDU_LEN(len_ddp) - len;
        icp = ip_to_icp(ip);
        icp->icp_seq = ntohl(cpl->seq);
        icp->icp_flags = ICPF_RX_HDR;

        /* This is the start of a new PDU.  There should be no old state. */
        MPASS(toep->ulpcb2 == NULL);
        toep->ulpcb2 = icp;

#if 0
        CTR5(KTR_CXGBE, "%s: tid %u, cpl->len %u, pdu_len_ddp 0x%04x, icp %p",
            __func__, tid, len, len_ddp, icp);
#endif

        m_freem(m);
        return (0);
}
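
/*
 * T5 delivers a PDU as CPL_ISCSI_HDR (header), an optional
 * CPL_ISCSI_DATA (data that was not placed directly), and
 * CPL_RX_ISCSI_DDP (final status).  T6 with completions enabled does
 * not deliver the header separately: CPL_ISCSI_DATA carries the
 * freelist data and CPL_RX_ISCSI_CMP carries the header along with the
 * final status.  toep->ulpcb2 holds the PDU being assembled in both
 * cases.
 */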
static int
do_rx_iscsi_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
        struct adapter *sc = iq->adapter;
        struct cpl_iscsi_data *cpl = mtod(m, struct cpl_iscsi_data *);
        u_int tid = GET_TID(cpl);
        struct toepcb *toep = lookup_tid(sc, tid);
        struct icl_cxgbei_pdu *icp = toep->ulpcb2;
        struct icl_pdu *ip;

        M_ASSERTPKTHDR(m);
        MPASS(m->m_pkthdr.len == be16toh(cpl->len) + sizeof(*cpl));

        if (icp == NULL) {
                /*
                 * T6 completion enabled, start of a new PDU.  Header
                 * will come in completion CPL.
                 */
                ip = icl_cxgbei_new_pdu(M_NOWAIT);
                if (ip == NULL)
                        CXGBE_UNIMPLEMENTED("PDU allocation failure");
                icp = ip_to_icp(ip);
        } else {
                /* T5 mode, header is already received. */
                MPASS(icp->icp_flags == ICPF_RX_HDR);
                MPASS(icp->ip.ip_data_mbuf == NULL);
                MPASS(icp->ip.ip_data_len == m->m_pkthdr.len - sizeof(*cpl));
        }

        /* Trim the cpl header from mbuf. */
        m_adj(m, sizeof(*cpl));

        icp->icp_flags |= ICPF_RX_FLBUF;
        icp->ip.ip_data_mbuf = m;
        toep->ofld_rxq->rx_iscsi_fl_pdus++;
        toep->ofld_rxq->rx_iscsi_fl_octets += m->m_pkthdr.len;

        /*
         * For T6, save the icp for further processing in the
         * completion handler.
         */
        if (icp->icp_flags == ICPF_RX_FLBUF) {
                MPASS(toep->ulpcb2 == NULL);
                toep->ulpcb2 = icp;
        }

#if 0
        CTR4(KTR_CXGBE, "%s: tid %u, cpl->len %u, icp %p", __func__, tid,
            be16toh(cpl->len), icp);
#endif

        return (0);
}
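
/*
 * Completes the PDU started by CPL_ISCSI_HDR: carries the DDP status
 * (digest/padding errors, whether the data was placed directly) and
 * hands the finished PDU to the connection's worker thread.
 */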
static int
do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
        struct adapter *sc = iq->adapter;
        const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
        u_int tid = GET_TID(cpl);
        struct toepcb *toep = lookup_tid(sc, tid);
        struct inpcb *inp = toep->inp;
        struct socket *so;
        struct sockbuf *sb;
        struct tcpcb *tp;
        struct icl_cxgbei_conn *icc;
        struct icl_conn *ic;
        struct icl_cxgbei_pdu *icp = toep->ulpcb2;
        struct icl_pdu *ip;
        u_int pdu_len, val;
        struct epoch_tracker et;

        MPASS(m == NULL);

        /* Must already be assembling a PDU. */
        MPASS(icp != NULL);
        MPASS(icp->icp_flags & ICPF_RX_HDR);    /* Data is optional. */
        MPASS((icp->icp_flags & ICPF_RX_STATUS) == 0);

        pdu_len = be16toh(cpl->len);    /* includes everything. */
        val = be32toh(cpl->ddpvld);

#if 0
        CTR5(KTR_CXGBE,
            "%s: tid %u, cpl->len %u, ddpvld 0x%08x, icp_flags 0x%08x",
            __func__, tid, pdu_len, val, icp->icp_flags);
#endif

        icp->icp_flags |= ICPF_RX_STATUS;
        ip = &icp->ip;
        if (val & F_DDP_PADDING_ERR) {
                ICL_WARN("received PDU 0x%02x with invalid padding",
                    ip->ip_bhs->bhs_opcode);
                toep->ofld_rxq->rx_iscsi_padding_errors++;
        }
        if (val & F_DDP_HDRCRC_ERR) {
                ICL_WARN("received PDU 0x%02x with invalid header digest",
                    ip->ip_bhs->bhs_opcode);
                toep->ofld_rxq->rx_iscsi_header_digest_errors++;
        }
        if (val & F_DDP_DATACRC_ERR) {
                ICL_WARN("received PDU 0x%02x with invalid data digest",
                    ip->ip_bhs->bhs_opcode);
                toep->ofld_rxq->rx_iscsi_data_digest_errors++;
        }
        if (val & F_DDP_PDU && ip->ip_data_mbuf == NULL) {
                MPASS((icp->icp_flags & ICPF_RX_FLBUF) == 0);
                MPASS(ip->ip_data_len > 0);
                icp->icp_flags |= ICPF_RX_DDP;
                toep->ofld_rxq->rx_iscsi_ddp_pdus++;
                toep->ofld_rxq->rx_iscsi_ddp_octets += ip->ip_data_len;
        }

        INP_WLOCK(inp);
        if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
                CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
                    __func__, tid, pdu_len, inp->inp_flags);
                INP_WUNLOCK(inp);
                icl_cxgbei_conn_pdu_free(NULL, ip);
                toep->ulpcb2 = NULL;
                return (0);
        }

        /*
         * T6+ does not report data PDUs received via DDP without F
         * set.  This can result in gaps in the TCP sequence space.
         */
        tp = intotcpcb(inp);
        MPASS(chip_id(sc) >= CHELSIO_T6 || icp->icp_seq == tp->rcv_nxt);
        tp->rcv_nxt = icp->icp_seq + pdu_len;
        tp->t_rcvtime = ticks;

        /*
         * Don't update the window size or return credits since RX
         * flow control is disabled.
         */

        so = inp->inp_socket;
        sb = &so->so_rcv;
        SOCKBUF_LOCK(sb);

        icc = toep->ulpcb;
        if (__predict_false(icc == NULL || sb->sb_state & SBS_CANTRCVMORE)) {
                CTR5(KTR_CXGBE,
                    "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x",
                    __func__, tid, pdu_len, icc, sb->sb_state);
                SOCKBUF_UNLOCK(sb);
                INP_WUNLOCK(inp);

                CURVNET_SET(so->so_vnet);
                NET_EPOCH_ENTER(et);
                INP_WLOCK(inp);
                tp = tcp_drop(tp, ECONNRESET);
                if (tp != NULL)
                        INP_WUNLOCK(inp);
                NET_EPOCH_EXIT(et);
                CURVNET_RESTORE();

                icl_cxgbei_conn_pdu_free(NULL, ip);
                toep->ulpcb2 = NULL;
                return (0);
        }
        MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
        ic = &icc->ic;
        if ((val & (F_DDP_PADDING_ERR | F_DDP_HDRCRC_ERR |
            F_DDP_DATACRC_ERR)) != 0) {
                SOCKBUF_UNLOCK(sb);
                INP_WUNLOCK(inp);

                icl_cxgbei_conn_pdu_free(NULL, ip);
                toep->ulpcb2 = NULL;
                ic->ic_error(ic);
                return (0);
        }
        icl_cxgbei_new_pdu_set_conn(ip, ic);

        MPASS(m == NULL);       /* was unused, we'll use it now. */
        m = sbcut_locked(sb, sbused(sb));       /* XXXNP: toep->sb_cc accounting? */
        if (__predict_false(m != NULL)) {
                int len = m_length(m, NULL);

                /*
                 * PDUs were received before the tid transitioned to ULP mode.
                 * Convert them to icl_cxgbei_pdus and send them to ICL before
                 * the PDU in icp/ip.
                 */
                CTR3(KTR_CXGBE, "%s: tid %u, %u bytes in so_rcv", __func__, tid,
                    len);

                /* XXXNP: needs to be rewritten. */
                if (len == sizeof(struct iscsi_bhs) ||
                    len == 4 + sizeof(struct iscsi_bhs)) {
                        struct icl_cxgbei_pdu *icp0;
                        struct icl_pdu *ip0;

                        ip0 = icl_cxgbei_new_pdu(M_NOWAIT);
                        if (ip0 == NULL)
                                CXGBE_UNIMPLEMENTED("PDU allocation failure");
                        icl_cxgbei_new_pdu_set_conn(ip0, ic);
                        icp0 = ip_to_icp(ip0);
                        icp0->icp_seq = 0;      /* XXX */
                        icp0->icp_flags = ICPF_RX_HDR | ICPF_RX_STATUS;
                        m_copydata(m, 0, sizeof(struct iscsi_bhs),
                            (void *)ip0->ip_bhs);
                        STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip0, ip_next);
                }
                m_freem(m);
        }

        STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);
        if ((icc->rx_flags & RXF_ACTIVE) == 0) {
                struct cxgbei_worker_thread_softc *cwt = &cwt_softc[icc->cwt];

                mtx_lock(&cwt->cwt_lock);
                icc->rx_flags |= RXF_ACTIVE;
                TAILQ_INSERT_TAIL(&cwt->rx_head, icc, rx_link);
                if (cwt->cwt_state == CWT_SLEEPING) {
                        cwt->cwt_state = CWT_RUNNING;
                        cv_signal(&cwt->cwt_cv);
                }
                mtx_unlock(&cwt->cwt_lock);
        }
        SOCKBUF_UNLOCK(sb);
        INP_WUNLOCK(inp);

        toep->ulpcb2 = NULL;

        return (0);
}
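
/*
 * T6 completion: delivers the header and the final status for a PDU in
 * a single CPL.  For data placed by DDP this may stand in for an
 * entire burst of Data-In/Data-Out PDUs, which is reported to ICL as
 * one large PDU.
 */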
static int
do_rx_iscsi_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
        struct epoch_tracker et;
        struct adapter *sc = iq->adapter;
        struct cpl_rx_iscsi_cmp *cpl = mtod(m, struct cpl_rx_iscsi_cmp *);
        u_int tid = GET_TID(cpl);
        struct toepcb *toep = lookup_tid(sc, tid);
        struct icl_cxgbei_pdu *icp = toep->ulpcb2;
        struct icl_pdu *ip;
        struct cxgbei_cmp *cmp;
        struct inpcb *inp = toep->inp;
#ifdef INVARIANTS
        uint16_t len = be16toh(cpl->len);
#endif
        struct socket *so;
        struct sockbuf *sb;
        struct tcpcb *tp;
        struct icl_cxgbei_conn *icc;
        struct icl_conn *ic;
        struct iscsi_bhs_data_out *bhsdo;
        u_int val = be32toh(cpl->ddpvld);
        u_int npdus, pdu_len, data_digest_len, hdr_digest_len;
        uint32_t prev_seg_len;

        M_ASSERTPKTHDR(m);
        MPASS(m->m_pkthdr.len == len + sizeof(*cpl));

        if ((val & F_DDP_PDU) == 0) {
                MPASS(icp != NULL);
                MPASS((icp->icp_flags & ICPF_RX_STATUS) == 0);
                ip = &icp->ip;
        }

        if (icp == NULL) {
                /* T6 completion enabled, start of a new PDU. */
                ip = icl_cxgbei_new_pdu(M_NOWAIT);
                if (ip == NULL)
                        CXGBE_UNIMPLEMENTED("PDU allocation failure");
                icp = ip_to_icp(ip);
        }
        pdu_len = G_ISCSI_PDU_LEN(be16toh(cpl->pdu_len_ddp));

#if 0
        CTR5(KTR_CXGBE,
            "%s: tid %u, cpl->len %u, ddpvld 0x%08x, icp %p",
            __func__, tid, pdu_len, val, icp);
#endif

        /* Copy header. */
        m_copydata(m, sizeof(*cpl), ISCSI_BHS_SIZE, (caddr_t)ip->ip_bhs);
        bhsdo = (struct iscsi_bhs_data_out *)ip->ip_bhs;
        ip->ip_data_len = bhsdo->bhsdo_data_segment_len[0] << 16 |
            bhsdo->bhsdo_data_segment_len[1] << 8 |
            bhsdo->bhsdo_data_segment_len[2];
        icp->icp_seq = ntohl(cpl->seq);
        icp->icp_flags |= ICPF_RX_HDR;
        icp->icp_flags |= ICPF_RX_STATUS;

        if (val & F_DDP_PADDING_ERR) {
                ICL_WARN("received PDU 0x%02x with invalid padding",
                    ip->ip_bhs->bhs_opcode);
                toep->ofld_rxq->rx_iscsi_padding_errors++;
        }
        if (val & F_DDP_HDRCRC_ERR) {
                ICL_WARN("received PDU 0x%02x with invalid header digest",
                    ip->ip_bhs->bhs_opcode);
                toep->ofld_rxq->rx_iscsi_header_digest_errors++;
        }
        if (val & F_DDP_DATACRC_ERR) {
                ICL_WARN("received PDU 0x%02x with invalid data digest",
                    ip->ip_bhs->bhs_opcode);
                toep->ofld_rxq->rx_iscsi_data_digest_errors++;
        }

        INP_WLOCK(inp);
        if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
                CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
                    __func__, tid, pdu_len, inp->inp_flags);
                INP_WUNLOCK(inp);
                icl_cxgbei_conn_pdu_free(NULL, ip);
                toep->ulpcb2 = NULL;
                m_freem(m);
                return (0);
        }

        tp = intotcpcb(inp);

        /*
         * If icc is NULL, the connection is being closed in
         * icl_cxgbei_conn_close(), just drop this data.
         */
        icc = toep->ulpcb;
        if (__predict_false(icc == NULL)) {
                CTR4(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes), icc %p",
                    __func__, tid, pdu_len, icc);

                /*
                 * Update rcv_nxt so the sequence number of the FIN
                 * doesn't appear wrong.
                 */
                tp->rcv_nxt = icp->icp_seq + pdu_len;
                tp->t_rcvtime = ticks;
                INP_WUNLOCK(inp);

                icl_cxgbei_conn_pdu_free(NULL, ip);
                toep->ulpcb2 = NULL;
                m_freem(m);
                return (0);
        }

        MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
        ic = &icc->ic;
        if ((val & (F_DDP_PADDING_ERR | F_DDP_HDRCRC_ERR |
            F_DDP_DATACRC_ERR)) != 0) {
                INP_WUNLOCK(inp);

                icl_cxgbei_conn_pdu_free(NULL, ip);
                toep->ulpcb2 = NULL;
                m_freem(m);
                ic->ic_error(ic);
                return (0);
        }

        data_digest_len = (icc->ulp_submode & ULP_CRC_DATA) ?
            ISCSI_DATA_DIGEST_SIZE : 0;
        hdr_digest_len = (icc->ulp_submode & ULP_CRC_HEADER) ?
            ISCSI_HEADER_DIGEST_SIZE : 0;
        MPASS(roundup2(ip->ip_data_len, 4) == pdu_len - len - data_digest_len);
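
        /*
         * For directly-placed data, only the last PDU of a burst is
         * delivered, so look up the command the burst belongs to in
         * order to account for the PDUs the hardware absorbed.
         */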
        if (val & F_DDP_PDU && ip->ip_data_mbuf == NULL) {
                MPASS((icp->icp_flags & ICPF_RX_FLBUF) == 0);
                MPASS(ip->ip_data_len > 0);
                icp->icp_flags |= ICPF_RX_DDP;
                bhsdo = (struct iscsi_bhs_data_out *)ip->ip_bhs;

                switch (ip->ip_bhs->bhs_opcode & ~ISCSI_BHS_OPCODE_IMMEDIATE) {
                case ISCSI_BHS_OPCODE_SCSI_DATA_IN:
                        cmp = cxgbei_find_cmp(icc,
                            be32toh(bhsdo->bhsdo_initiator_task_tag));
                        break;
                case ISCSI_BHS_OPCODE_SCSI_DATA_OUT:
                        cmp = cxgbei_find_cmp(icc,
                            be32toh(bhsdo->bhsdo_target_transfer_tag));
                        break;
                default:
                        __assert_unreachable();
                }
                MPASS(cmp != NULL);

                /*
                 * The difference between the end of the last burst
                 * and the offset of the last PDU in this burst is
                 * the additional data received via DDP.
                 */
                prev_seg_len = be32toh(bhsdo->bhsdo_buffer_offset) -
                    cmp->next_buffer_offset;

                if (prev_seg_len != 0) {
                        uint32_t orig_datasn;

                        /*
                         * Return a "large" PDU representing the burst
                         * of PDUs.  Adjust the offset and length of
                         * this PDU to represent the entire burst.
                         */
                        ip->ip_data_len += prev_seg_len;
                        bhsdo->bhsdo_data_segment_len[2] = ip->ip_data_len;
                        bhsdo->bhsdo_data_segment_len[1] = ip->ip_data_len >> 8;
                        bhsdo->bhsdo_data_segment_len[0] = ip->ip_data_len >> 16;
                        bhsdo->bhsdo_buffer_offset =
                            htobe32(cmp->next_buffer_offset);

                        orig_datasn = be32toh(bhsdo->bhsdo_datasn);
                        npdus = orig_datasn - cmp->last_datasn;
                        bhsdo->bhsdo_datasn = htobe32(cmp->last_datasn + 1);
                        cmp->last_datasn = orig_datasn;
                        ip->ip_additional_pdus = npdus - 1;
                } else {
                        MPASS(be32toh(bhsdo->bhsdo_datasn) ==
                            cmp->last_datasn + 1);
                        npdus = 1;
                        cmp->last_datasn = be32toh(bhsdo->bhsdo_datasn);
                }

                cmp->next_buffer_offset += ip->ip_data_len;
                toep->ofld_rxq->rx_iscsi_ddp_pdus += npdus;
                toep->ofld_rxq->rx_iscsi_ddp_octets += ip->ip_data_len;
        } else {
                MPASS(icp->icp_flags & (ICPF_RX_FLBUF));
                MPASS(ip->ip_data_len == ip->ip_data_mbuf->m_pkthdr.len);
        }

        tp->rcv_nxt = icp->icp_seq + pdu_len;
        tp->t_rcvtime = ticks;

        /*
         * Don't update the window size or return credits since RX
         * flow control is disabled.
         */

        so = inp->inp_socket;
        sb = &so->so_rcv;
        SOCKBUF_LOCK(sb);
        if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
                CTR5(KTR_CXGBE,
                    "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x",
                    __func__, tid, pdu_len, icc, sb->sb_state);
                SOCKBUF_UNLOCK(sb);
                INP_WUNLOCK(inp);

                CURVNET_SET(so->so_vnet);
                NET_EPOCH_ENTER(et);
                INP_WLOCK(inp);
                tp = tcp_drop(tp, ECONNRESET);
                if (tp != NULL)
                        INP_WUNLOCK(inp);
                NET_EPOCH_EXIT(et);
                CURVNET_RESTORE();

                icl_cxgbei_conn_pdu_free(NULL, ip);
                toep->ulpcb2 = NULL;
                m_freem(m);
                return (0);
        }
        icl_cxgbei_new_pdu_set_conn(ip, ic);

        /* Enqueue the PDU on the connection's received PDU queue. */
        STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);
        if ((icc->rx_flags & RXF_ACTIVE) == 0) {
                struct cxgbei_worker_thread_softc *cwt = &cwt_softc[icc->cwt];

                mtx_lock(&cwt->cwt_lock);
                icc->rx_flags |= RXF_ACTIVE;
                TAILQ_INSERT_TAIL(&cwt->rx_head, icc, rx_link);
                if (cwt->cwt_state == CWT_SLEEPING) {
                        cwt->cwt_state = CWT_RUNNING;
                        cv_signal(&cwt->cwt_cv);
                }
                mtx_unlock(&cwt->cwt_lock);
        }
        SOCKBUF_UNLOCK(sb);
        INP_WUNLOCK(inp);

        toep->ulpcb2 = NULL;
        m_freem(m);

        return (0);
}
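
/*
 * Adapter ULD hooks.  cxgbei_activate() sets up the per-adapter iSCSI
 * state (page pod region, PDU limits, sysctls); cxgbei_deactivate()
 * tears it down.  Both run within a synchronized operation on the
 * adapter.
 */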
static int
cxgbei_activate(struct adapter *sc)
{
        struct cxgbei_data *ci;
        int rc;

        ASSERT_SYNCHRONIZED_OP(sc);

        if (uld_active(sc, ULD_ISCSI)) {
                KASSERT(0, ("%s: iSCSI offload already enabled on adapter %p",
                    __func__, sc));
                return (0);
        }

        if (sc->iscsicaps == 0 || sc->vres.iscsi.size == 0) {
                device_printf(sc->dev,
                    "not iSCSI offload capable, or capability disabled.\n");
                return (ENOSYS);
        }

        /* per-adapter softc for iSCSI; M_WAITOK allocations cannot fail. */
        ci = malloc(sizeof(*ci), M_CXGBE, M_ZERO | M_WAITOK);

        rc = cxgbei_init(sc, ci);
        if (rc != 0) {
                free(ci, M_CXGBE);
                return (rc);
        }

        sc->iscsi_ulp_softc = ci;

        return (0);
}

static int
cxgbei_deactivate(struct adapter *sc)
{
        struct cxgbei_data *ci = sc->iscsi_ulp_softc;

        ASSERT_SYNCHRONIZED_OP(sc);

        if (ci != NULL) {
                sysctl_ctx_free(&ci->ctx);
                t4_free_ppod_region(&ci->pr);
                free(ci, M_CXGBE);
                sc->iscsi_ulp_softc = NULL;
        }

        return (0);
}

static void
cxgbei_activate_all(struct adapter *sc, void *arg __unused)
{

        if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isact") != 0)
                return;

        /* Activate iSCSI if any port on this adapter has IFCAP_TOE enabled. */
        if (sc->offload_map && !uld_active(sc, ULD_ISCSI))
                (void) t4_activate_uld(sc, ULD_ISCSI);

        end_synchronized_op(sc, 0);
}

static void
cxgbei_deactivate_all(struct adapter *sc, void *arg __unused)
{

        if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isdea") != 0)
                return;

        if (uld_active(sc, ULD_ISCSI))
                (void) t4_deactivate_uld(sc, ULD_ISCSI);

        end_synchronized_op(sc, 0);
}

static struct uld_info cxgbei_uld_info = {
        .uld_id = ULD_ISCSI,
        .activate = cxgbei_activate,
        .deactivate = cxgbei_deactivate,
};
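
/*
 * Worker thread main loop.  Connections with pending PDUs are queued
 * to a worker thread; the thread swaps each connection's rcvd_pdus
 * list out from under the socket buffer lock and feeds the PDUs to
 * ICL with the lock dropped.
 */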
static void
cwt_main(void *arg)
{
        struct cxgbei_worker_thread_softc *cwt = arg;
        struct icl_cxgbei_conn *icc = NULL;
        struct icl_conn *ic;
        struct icl_pdu *ip;
        struct sockbuf *sb;
        STAILQ_HEAD(, icl_pdu) rx_pdus = STAILQ_HEAD_INITIALIZER(rx_pdus);

        MPASS(cwt != NULL);

        mtx_lock(&cwt->cwt_lock);
        MPASS(cwt->cwt_state == 0);
        cwt->cwt_state = CWT_RUNNING;
        cv_signal(&cwt->cwt_cv);

        while (__predict_true(cwt->cwt_state != CWT_STOP)) {
                cwt->cwt_state = CWT_RUNNING;
                while ((icc = TAILQ_FIRST(&cwt->rx_head)) != NULL) {
                        TAILQ_REMOVE(&cwt->rx_head, icc, rx_link);
                        mtx_unlock(&cwt->cwt_lock);

                        ic = &icc->ic;
                        sb = &ic->ic_socket->so_rcv;

                        SOCKBUF_LOCK(sb);
                        MPASS(icc->rx_flags & RXF_ACTIVE);
                        if (__predict_true(!(sb->sb_state & SBS_CANTRCVMORE))) {
                                MPASS(STAILQ_EMPTY(&rx_pdus));
                                STAILQ_SWAP(&icc->rcvd_pdus, &rx_pdus, icl_pdu);
                                SOCKBUF_UNLOCK(sb);

                                /* Hand over PDUs to ICL. */
                                while ((ip = STAILQ_FIRST(&rx_pdus)) != NULL) {
                                        STAILQ_REMOVE_HEAD(&rx_pdus, ip_next);
                                        ic->ic_receive(ip);
                                }

                                SOCKBUF_LOCK(sb);
                                MPASS(STAILQ_EMPTY(&rx_pdus));
                        }
                        MPASS(icc->rx_flags & RXF_ACTIVE);
                        if (STAILQ_EMPTY(&icc->rcvd_pdus) ||
                            __predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
                                icc->rx_flags &= ~RXF_ACTIVE;
                        } else {
                                /*
                                 * More PDUs were received while we were busy
                                 * handing over the previous batch to ICL.
                                 * Re-add this connection to the end of the
                                 * queue.
                                 */
                                mtx_lock(&cwt->cwt_lock);
                                TAILQ_INSERT_TAIL(&cwt->rx_head, icc,
                                    rx_link);
                                mtx_unlock(&cwt->cwt_lock);
                        }
                        SOCKBUF_UNLOCK(sb);

                        mtx_lock(&cwt->cwt_lock);
                }

                /* Inner loop doesn't check for CWT_STOP, do that first. */
                if (__predict_false(cwt->cwt_state == CWT_STOP))
                        break;
                cwt->cwt_state = CWT_SLEEPING;
                cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
        }

        MPASS(TAILQ_FIRST(&cwt->rx_head) == NULL);
        mtx_assert(&cwt->cwt_lock, MA_OWNED);
        cwt->cwt_state = CWT_STOPPED;
        cv_signal(&cwt->cwt_cv);
        mtx_unlock(&cwt->cwt_lock);
        kthread_exit();
}

static int
start_worker_threads(void)
{
        int i, rc;
        struct cxgbei_worker_thread_softc *cwt;

        worker_thread_count = min(mp_ncpus, 32);
        cwt_softc = malloc(worker_thread_count * sizeof(*cwt), M_CXGBE,
            M_WAITOK | M_ZERO);

        MPASS(cxgbei_proc == NULL);
        for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) {
                mtx_init(&cwt->cwt_lock, "cwt lock", NULL, MTX_DEF);
                cv_init(&cwt->cwt_cv, "cwt cv");
                TAILQ_INIT(&cwt->rx_head);
                rc = kproc_kthread_add(cwt_main, cwt, &cxgbei_proc, NULL, 0, 0,
                    "cxgbei", "%d", i);
                if (rc != 0) {
                        printf("cxgbei: failed to start thread #%d/%d (%d)\n",
                            i + 1, worker_thread_count, rc);
                        mtx_destroy(&cwt->cwt_lock);
                        cv_destroy(&cwt->cwt_cv);
                        bzero(cwt, sizeof(*cwt));
                        if (i == 0) {
                                free(cwt_softc, M_CXGBE);
                                worker_thread_count = 0;

                                return (rc);
                        }

                        /* Not fatal, carry on with fewer threads. */
                        worker_thread_count = i;
                        rc = 0;
                        break;
                }

                /* Wait for thread to start before moving on to the next one. */
                mtx_lock(&cwt->cwt_lock);
                while (cwt->cwt_state == 0)
                        cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
                mtx_unlock(&cwt->cwt_lock);
        }

        MPASS(cwt_softc != NULL);
        MPASS(worker_thread_count > 0);
        return (0);
}

static void
stop_worker_threads(void)
{
        int i;
        struct cxgbei_worker_thread_softc *cwt;

        MPASS(worker_thread_count >= 0);

        for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) {
                mtx_lock(&cwt->cwt_lock);
                MPASS(cwt->cwt_state == CWT_RUNNING ||
                    cwt->cwt_state == CWT_SLEEPING);
                cwt->cwt_state = CWT_STOP;
                cv_signal(&cwt->cwt_cv);
                do {
                        cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
                } while (cwt->cwt_state != CWT_STOPPED);
                mtx_unlock(&cwt->cwt_lock);
                mtx_destroy(&cwt->cwt_lock);
                cv_destroy(&cwt->cwt_cv);
        }
        free(cwt_softc, M_CXGBE);
}

/* Select a worker thread for a connection. */
u_int
cxgbei_select_worker_thread(struct icl_cxgbei_conn *icc)
{
        struct adapter *sc = icc->sc;
        struct toepcb *toep = icc->toep;
        u_int i, n;

        n = worker_thread_count / sc->sge.nofldrxq;
        if (n > 0)
                i = toep->vi->pi->port_id * n + arc4random() % n;
        else
                i = arc4random() % worker_thread_count;

        CTR3(KTR_CXGBE, "%s: tid %u, cwt %u", __func__, toep->tid, i);

        return (i);
}
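
/*
 * Illustrative numbers: with 8 worker threads and 2 offload RX queues,
 * n is 4, so connections on port 0 are spread over threads 0-3 and
 * connections on port 1 over threads 4-7.  With more RX queues than
 * threads, n is 0 and a thread is picked uniformly at random.
 */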

static int
cxgbei_mod_load(void)
{
        int rc;

        t4_register_cpl_handler(CPL_ISCSI_HDR, do_rx_iscsi_hdr);
        t4_register_cpl_handler(CPL_ISCSI_DATA, do_rx_iscsi_data);
        t4_register_cpl_handler(CPL_RX_ISCSI_DDP, do_rx_iscsi_ddp);
        t4_register_cpl_handler(CPL_RX_ISCSI_CMP, do_rx_iscsi_cmp);

        rc = start_worker_threads();
        if (rc != 0)
                return (rc);

        rc = t4_register_uld(&cxgbei_uld_info);
        if (rc != 0) {
                stop_worker_threads();
                return (rc);
        }

        t4_iterate(cxgbei_activate_all, NULL);

        return (rc);
}

static int
cxgbei_mod_unload(void)
{

        t4_iterate(cxgbei_deactivate_all, NULL);

        if (t4_unregister_uld(&cxgbei_uld_info) == EBUSY)
                return (EBUSY);

        stop_worker_threads();

        t4_register_cpl_handler(CPL_ISCSI_HDR, NULL);
        t4_register_cpl_handler(CPL_ISCSI_DATA, NULL);
        t4_register_cpl_handler(CPL_RX_ISCSI_DDP, NULL);
        t4_register_cpl_handler(CPL_RX_ISCSI_CMP, NULL);

        return (0);
}
#endif

static int
cxgbei_modevent(module_t mod, int cmd, void *arg)
{
        int rc = 0;

#ifdef TCP_OFFLOAD
        switch (cmd) {
        case MOD_LOAD:
                rc = cxgbei_mod_load();
                if (rc == 0)
                        rc = icl_cxgbei_mod_load();
                break;

        case MOD_UNLOAD:
                rc = icl_cxgbei_mod_unload();
                if (rc == 0)
                        rc = cxgbei_mod_unload();
                break;

        default:
                rc = EINVAL;
        }
#else
        printf("cxgbei: compiled without TCP_OFFLOAD support.\n");
        rc = EOPNOTSUPP;
#endif

        return (rc);
}

static moduledata_t cxgbei_mod = {
        "cxgbei",
        cxgbei_modevent,
        NULL,
};

MODULE_VERSION(cxgbei, 1);
DECLARE_MODULE(cxgbei, cxgbei_mod, SI_SUB_EXEC, SI_ORDER_ANY);
MODULE_DEPEND(cxgbei, t4_tom, 1, 1, 1);
MODULE_DEPEND(cxgbei, cxgbe, 1, 1, 1);
MODULE_DEPEND(cxgbei, icl, 1, 1, 1);