1 /*- 2 * Copyright (c) 2012 Chelsio Communications, Inc. 3 * All rights reserved. 4 * 5 * Chelsio T5xx iSCSI driver 6 * 7 * Written by: Sreenivasa Honnur <shonnur@chelsio.com> 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 */ 30 31 #include <sys/cdefs.h> 32 __FBSDID("$FreeBSD$"); 33 34 #include "opt_inet.h" 35 #include "opt_inet6.h" 36 37 #include <sys/types.h> 38 #include <sys/param.h> 39 #include <sys/kernel.h> 40 #include <sys/ktr.h> 41 #include <sys/module.h> 42 #include <sys/systm.h> 43 44 #ifdef TCP_OFFLOAD 45 #include <sys/errno.h> 46 #include <sys/kthread.h> 47 #include <sys/smp.h> 48 #include <sys/socket.h> 49 #include <sys/socketvar.h> 50 #include <sys/mbuf.h> 51 #include <sys/lock.h> 52 #include <sys/mutex.h> 53 #include <sys/condvar.h> 54 55 #include <netinet/in.h> 56 #include <netinet/in_pcb.h> 57 #include <netinet/toecore.h> 58 #include <netinet/tcp_var.h> 59 #include <netinet/tcp_fsm.h> 60 61 #include <cam/scsi/scsi_all.h> 62 #include <cam/scsi/scsi_da.h> 63 #include <cam/ctl/ctl_io.h> 64 #include <cam/ctl/ctl.h> 65 #include <cam/ctl/ctl_backend.h> 66 #include <cam/ctl/ctl_error.h> 67 #include <cam/ctl/ctl_frontend.h> 68 #include <cam/ctl/ctl_debug.h> 69 #include <cam/ctl/ctl_ha.h> 70 #include <cam/ctl/ctl_ioctl.h> 71 72 #include <dev/iscsi/icl.h> 73 #include <dev/iscsi/iscsi_proto.h> 74 #include <dev/iscsi/iscsi_ioctl.h> 75 #include <dev/iscsi/iscsi.h> 76 #include <cam/ctl/ctl_frontend_iscsi.h> 77 78 #include <cam/cam.h> 79 #include <cam/cam_ccb.h> 80 #include <cam/cam_xpt.h> 81 #include <cam/cam_debug.h> 82 #include <cam/cam_sim.h> 83 #include <cam/cam_xpt_sim.h> 84 #include <cam/cam_xpt_periph.h> 85 #include <cam/cam_periph.h> 86 #include <cam/cam_compat.h> 87 #include <cam/scsi/scsi_message.h> 88 89 #include "common/common.h" 90 #include "common/t4_msg.h" 91 #include "common/t4_regs.h" /* for PCIE_MEM_ACCESS */ 92 #include "tom/t4_tom.h" 93 #include "cxgbei.h" 94 95 static int worker_thread_count; 96 static struct cxgbei_worker_thread_softc *cwt_softc; 97 static struct proc *cxgbei_proc; 98 99 static void 100 read_pdu_limits(struct adapter *sc, uint32_t *max_tx_data_len, 101 uint32_t *max_rx_data_len, struct ppod_region *pr) 102 { 103 uint32_t tx_len, rx_len, r, v; 104 105 rx_len = t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE); 106 tx_len = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE); 107 108 r = t4_read_reg(sc, A_TP_PARA_REG2); 109 rx_len = min(rx_len, G_MAXRXDATA(r)); 110 tx_len = min(tx_len, G_MAXRXDATA(r)); 111 112 r = t4_read_reg(sc, A_TP_PARA_REG7); 113 v = min(G_PMMAXXFERLEN0(r), G_PMMAXXFERLEN1(r)); 114 rx_len = min(rx_len, v); 115 tx_len = min(tx_len, v); 116 117 /* 118 * AHS is not supported by the kernel so we'll not account for 119 * it either in our PDU len -> data segment len conversions. 120 */ 121 rx_len -= ISCSI_BHS_SIZE + ISCSI_HEADER_DIGEST_SIZE + 122 ISCSI_DATA_DIGEST_SIZE; 123 tx_len -= ISCSI_BHS_SIZE + ISCSI_HEADER_DIGEST_SIZE + 124 ISCSI_DATA_DIGEST_SIZE; 125 126 /* 127 * DDP can place only 4 pages for a single PDU. A single 128 * request might use larger pages than the smallest page size, 129 * but that cannot be guaranteed. Assume the smallest DDP 130 * page size for this limit. 131 */ 132 rx_len = min(rx_len, 4 * (1U << pr->pr_page_shift[0])); 133 134 if (chip_id(sc) == CHELSIO_T5) { 135 tx_len = min(tx_len, 15360); 136 137 rx_len = rounddown2(rx_len, 512); 138 tx_len = rounddown2(tx_len, 512); 139 } 140 141 *max_tx_data_len = tx_len; 142 *max_rx_data_len = rx_len; 143 } 144 145 /* 146 * Initialize the software state of the iSCSI ULP driver. 147 * 148 * ENXIO means firmware didn't set up something that it was supposed to. 149 */ 150 static int 151 cxgbei_init(struct adapter *sc, struct cxgbei_data *ci) 152 { 153 struct sysctl_oid *oid; 154 struct sysctl_oid_list *children; 155 struct ppod_region *pr; 156 uint32_t r; 157 int rc; 158 159 MPASS(sc->vres.iscsi.size > 0); 160 MPASS(ci != NULL); 161 162 pr = &ci->pr; 163 r = t4_read_reg(sc, A_ULP_RX_ISCSI_PSZ); 164 rc = t4_init_ppod_region(pr, &sc->vres.iscsi, r, "iSCSI page pods"); 165 if (rc != 0) { 166 device_printf(sc->dev, 167 "%s: failed to initialize the iSCSI page pod region: %u.\n", 168 __func__, rc); 169 return (rc); 170 } 171 172 r = t4_read_reg(sc, A_ULP_RX_ISCSI_TAGMASK); 173 r &= V_ISCSITAGMASK(M_ISCSITAGMASK); 174 if (r != pr->pr_tag_mask) { 175 /* 176 * Recent firmwares are supposed to set up the iSCSI tagmask 177 * but we'll do it ourselves it the computed value doesn't match 178 * what's in the register. 179 */ 180 device_printf(sc->dev, 181 "tagmask 0x%08x does not match computed mask 0x%08x.\n", r, 182 pr->pr_tag_mask); 183 t4_set_reg_field(sc, A_ULP_RX_ISCSI_TAGMASK, 184 V_ISCSITAGMASK(M_ISCSITAGMASK), pr->pr_tag_mask); 185 } 186 187 read_pdu_limits(sc, &ci->max_tx_data_len, &ci->max_rx_data_len, pr); 188 189 sysctl_ctx_init(&ci->ctx); 190 oid = device_get_sysctl_tree(sc->dev); /* dev.t5nex.X */ 191 children = SYSCTL_CHILDREN(oid); 192 193 oid = SYSCTL_ADD_NODE(&ci->ctx, children, OID_AUTO, "iscsi", 194 CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "iSCSI ULP settings"); 195 children = SYSCTL_CHILDREN(oid); 196 197 ci->ddp_threshold = 2048; 198 SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "ddp_threshold", 199 CTLFLAG_RW, &ci->ddp_threshold, 0, "Rx zero copy threshold"); 200 201 SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "max_rx_data_len", 202 CTLFLAG_RD, &ci->max_rx_data_len, 0, 203 "Maximum receive data segment length"); 204 SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "max_tx_data_len", 205 CTLFLAG_RD, &ci->max_tx_data_len, 0, 206 "Maximum transmit data segment length"); 207 208 return (0); 209 } 210 211 static int 212 do_rx_iscsi_hdr(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 213 { 214 struct adapter *sc = iq->adapter; 215 struct cpl_iscsi_hdr *cpl = mtod(m, struct cpl_iscsi_hdr *); 216 u_int tid = GET_TID(cpl); 217 struct toepcb *toep = lookup_tid(sc, tid); 218 struct icl_pdu *ip; 219 struct icl_cxgbei_pdu *icp; 220 uint16_t len_ddp = be16toh(cpl->pdu_len_ddp); 221 uint16_t len = be16toh(cpl->len); 222 223 M_ASSERTPKTHDR(m); 224 MPASS(m->m_pkthdr.len == len + sizeof(*cpl)); 225 226 ip = icl_cxgbei_new_pdu(M_NOWAIT); 227 if (ip == NULL) 228 CXGBE_UNIMPLEMENTED("PDU allocation failure"); 229 m_copydata(m, sizeof(*cpl), ISCSI_BHS_SIZE, (caddr_t)ip->ip_bhs); 230 ip->ip_data_len = G_ISCSI_PDU_LEN(len_ddp) - len; 231 icp = ip_to_icp(ip); 232 icp->icp_seq = ntohl(cpl->seq); 233 icp->icp_flags = ICPF_RX_HDR; 234 235 /* This is the start of a new PDU. There should be no old state. */ 236 MPASS(toep->ulpcb2 == NULL); 237 toep->ulpcb2 = icp; 238 239 #if 0 240 CTR5(KTR_CXGBE, "%s: tid %u, cpl->len %u, pdu_len_ddp 0x%04x, icp %p", 241 __func__, tid, len, len_ddp, icp); 242 #endif 243 244 m_freem(m); 245 return (0); 246 } 247 248 static int 249 do_rx_iscsi_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 250 { 251 struct adapter *sc = iq->adapter; 252 struct cpl_iscsi_data *cpl = mtod(m, struct cpl_iscsi_data *); 253 u_int tid = GET_TID(cpl); 254 struct toepcb *toep = lookup_tid(sc, tid); 255 struct icl_cxgbei_pdu *icp = toep->ulpcb2; 256 struct icl_pdu *ip; 257 258 M_ASSERTPKTHDR(m); 259 MPASS(m->m_pkthdr.len == be16toh(cpl->len) + sizeof(*cpl)); 260 261 if (icp == NULL) { 262 /* 263 * T6 completion enabled, start of a new pdu. Header 264 * will come in completion CPL. 265 */ 266 ip = icl_cxgbei_new_pdu(M_NOWAIT); 267 if (ip == NULL) 268 CXGBE_UNIMPLEMENTED("PDU allocation failure"); 269 icp = ip_to_icp(ip); 270 } else { 271 /* T5 mode, header is already received. */ 272 MPASS(icp->icp_flags == ICPF_RX_HDR); 273 MPASS(icp->ip.ip_data_mbuf == NULL); 274 MPASS(icp->ip.ip_data_len == m->m_pkthdr.len - sizeof(*cpl)); 275 } 276 277 /* Trim the cpl header from mbuf. */ 278 m_adj(m, sizeof(*cpl)); 279 280 icp->icp_flags |= ICPF_RX_FLBUF; 281 icp->ip.ip_data_mbuf = m; 282 toep->ofld_rxq->rx_iscsi_fl_pdus++; 283 toep->ofld_rxq->rx_iscsi_fl_octets += m->m_pkthdr.len; 284 285 /* 286 * For T6, save the icp for further processing in the 287 * completion handler. 288 */ 289 if (icp->icp_flags == ICPF_RX_FLBUF) { 290 MPASS(toep->ulpcb2 == NULL); 291 toep->ulpcb2 = icp; 292 } 293 294 #if 0 295 CTR4(KTR_CXGBE, "%s: tid %u, cpl->len %u, icp %p", __func__, tid, 296 be16toh(cpl->len), icp); 297 #endif 298 299 return (0); 300 } 301 302 static int 303 do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 304 { 305 struct adapter *sc = iq->adapter; 306 const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1); 307 u_int tid = GET_TID(cpl); 308 struct toepcb *toep = lookup_tid(sc, tid); 309 struct inpcb *inp = toep->inp; 310 struct socket *so; 311 struct sockbuf *sb; 312 struct tcpcb *tp; 313 struct icl_cxgbei_conn *icc; 314 struct icl_conn *ic; 315 struct icl_cxgbei_pdu *icp = toep->ulpcb2; 316 struct icl_pdu *ip; 317 u_int pdu_len, val; 318 struct epoch_tracker et; 319 320 MPASS(m == NULL); 321 322 /* Must already be assembling a PDU. */ 323 MPASS(icp != NULL); 324 MPASS(icp->icp_flags & ICPF_RX_HDR); /* Data is optional. */ 325 MPASS((icp->icp_flags & ICPF_RX_STATUS) == 0); 326 327 pdu_len = be16toh(cpl->len); /* includes everything. */ 328 val = be32toh(cpl->ddpvld); 329 330 #if 0 331 CTR5(KTR_CXGBE, 332 "%s: tid %u, cpl->len %u, ddpvld 0x%08x, icp_flags 0x%08x", 333 __func__, tid, pdu_len, val, icp->icp_flags); 334 #endif 335 336 icp->icp_flags |= ICPF_RX_STATUS; 337 ip = &icp->ip; 338 if (val & F_DDP_PADDING_ERR) { 339 ICL_WARN("received PDU 0x%02x with invalid padding", 340 ip->ip_bhs->bhs_opcode); 341 toep->ofld_rxq->rx_iscsi_padding_errors++; 342 } 343 if (val & F_DDP_HDRCRC_ERR) { 344 ICL_WARN("received PDU 0x%02x with invalid header digest", 345 ip->ip_bhs->bhs_opcode); 346 toep->ofld_rxq->rx_iscsi_header_digest_errors++; 347 } 348 if (val & F_DDP_DATACRC_ERR) { 349 ICL_WARN("received PDU 0x%02x with invalid data digest", 350 ip->ip_bhs->bhs_opcode); 351 toep->ofld_rxq->rx_iscsi_data_digest_errors++; 352 } 353 if (val & F_DDP_PDU && ip->ip_data_mbuf == NULL) { 354 MPASS((icp->icp_flags & ICPF_RX_FLBUF) == 0); 355 MPASS(ip->ip_data_len > 0); 356 icp->icp_flags |= ICPF_RX_DDP; 357 toep->ofld_rxq->rx_iscsi_ddp_pdus++; 358 toep->ofld_rxq->rx_iscsi_ddp_octets += ip->ip_data_len; 359 } 360 361 INP_WLOCK(inp); 362 if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) { 363 CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", 364 __func__, tid, pdu_len, inp->inp_flags); 365 INP_WUNLOCK(inp); 366 icl_cxgbei_conn_pdu_free(NULL, ip); 367 toep->ulpcb2 = NULL; 368 return (0); 369 } 370 371 /* 372 * T6+ does not report data PDUs received via DDP without F 373 * set. This can result in gaps in the TCP sequence space. 374 */ 375 tp = intotcpcb(inp); 376 MPASS(chip_id(sc) >= CHELSIO_T6 || icp->icp_seq == tp->rcv_nxt); 377 tp->rcv_nxt = icp->icp_seq + pdu_len; 378 tp->t_rcvtime = ticks; 379 380 /* 381 * Don't update the window size or return credits since RX 382 * flow control is disabled. 383 */ 384 385 so = inp->inp_socket; 386 sb = &so->so_rcv; 387 SOCKBUF_LOCK(sb); 388 389 icc = toep->ulpcb; 390 if (__predict_false(icc == NULL || sb->sb_state & SBS_CANTRCVMORE)) { 391 CTR5(KTR_CXGBE, 392 "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x", 393 __func__, tid, pdu_len, icc, sb->sb_state); 394 SOCKBUF_UNLOCK(sb); 395 INP_WUNLOCK(inp); 396 397 CURVNET_SET(so->so_vnet); 398 NET_EPOCH_ENTER(et); 399 INP_WLOCK(inp); 400 tp = tcp_drop(tp, ECONNRESET); 401 if (tp) 402 INP_WUNLOCK(inp); 403 NET_EPOCH_EXIT(et); 404 CURVNET_RESTORE(); 405 406 icl_cxgbei_conn_pdu_free(NULL, ip); 407 toep->ulpcb2 = NULL; 408 return (0); 409 } 410 MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE); 411 ic = &icc->ic; 412 if ((val & (F_DDP_PADDING_ERR | F_DDP_HDRCRC_ERR | 413 F_DDP_DATACRC_ERR)) != 0) { 414 SOCKBUF_UNLOCK(sb); 415 INP_WUNLOCK(inp); 416 417 icl_cxgbei_conn_pdu_free(NULL, ip); 418 toep->ulpcb2 = NULL; 419 ic->ic_error(ic); 420 return (0); 421 } 422 icl_cxgbei_new_pdu_set_conn(ip, ic); 423 424 MPASS(m == NULL); /* was unused, we'll use it now. */ 425 m = sbcut_locked(sb, sbused(sb)); /* XXXNP: toep->sb_cc accounting? */ 426 if (__predict_false(m != NULL)) { 427 int len = m_length(m, NULL); 428 429 /* 430 * PDUs were received before the tid transitioned to ULP mode. 431 * Convert them to icl_cxgbei_pdus and send them to ICL before 432 * the PDU in icp/ip. 433 */ 434 CTR3(KTR_CXGBE, "%s: tid %u, %u bytes in so_rcv", __func__, tid, 435 len); 436 437 /* XXXNP: needs to be rewritten. */ 438 if (len == sizeof(struct iscsi_bhs) || len == 4 + sizeof(struct 439 iscsi_bhs)) { 440 struct icl_cxgbei_pdu *icp0; 441 struct icl_pdu *ip0; 442 443 ip0 = icl_cxgbei_new_pdu(M_NOWAIT); 444 if (ip0 == NULL) 445 CXGBE_UNIMPLEMENTED("PDU allocation failure"); 446 icl_cxgbei_new_pdu_set_conn(ip0, ic); 447 icp0 = ip_to_icp(ip0); 448 icp0->icp_seq = 0; /* XXX */ 449 icp0->icp_flags = ICPF_RX_HDR | ICPF_RX_STATUS; 450 m_copydata(m, 0, sizeof(struct iscsi_bhs), (void *)ip0->ip_bhs); 451 STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip0, ip_next); 452 } 453 m_freem(m); 454 } 455 456 STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next); 457 if ((icc->rx_flags & RXF_ACTIVE) == 0) { 458 struct cxgbei_worker_thread_softc *cwt = &cwt_softc[icc->cwt]; 459 460 mtx_lock(&cwt->cwt_lock); 461 icc->rx_flags |= RXF_ACTIVE; 462 TAILQ_INSERT_TAIL(&cwt->rx_head, icc, rx_link); 463 if (cwt->cwt_state == CWT_SLEEPING) { 464 cwt->cwt_state = CWT_RUNNING; 465 cv_signal(&cwt->cwt_cv); 466 } 467 mtx_unlock(&cwt->cwt_lock); 468 } 469 SOCKBUF_UNLOCK(sb); 470 INP_WUNLOCK(inp); 471 472 toep->ulpcb2 = NULL; 473 474 return (0); 475 } 476 477 static int 478 do_rx_iscsi_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 479 { 480 struct epoch_tracker et; 481 struct adapter *sc = iq->adapter; 482 struct cpl_rx_iscsi_cmp *cpl = mtod(m, struct cpl_rx_iscsi_cmp *); 483 u_int tid = GET_TID(cpl); 484 struct toepcb *toep = lookup_tid(sc, tid); 485 struct icl_cxgbei_pdu *icp = toep->ulpcb2; 486 struct icl_pdu *ip; 487 struct cxgbei_cmp *cmp; 488 struct inpcb *inp = toep->inp; 489 #ifdef INVARIANTS 490 uint16_t len = be16toh(cpl->len); 491 u_int data_digest_len; 492 #endif 493 struct socket *so; 494 struct sockbuf *sb; 495 struct tcpcb *tp; 496 struct icl_cxgbei_conn *icc; 497 struct icl_conn *ic; 498 struct iscsi_bhs_data_out *bhsdo; 499 u_int val = be32toh(cpl->ddpvld); 500 u_int npdus, pdu_len; 501 uint32_t prev_seg_len; 502 503 M_ASSERTPKTHDR(m); 504 MPASS(m->m_pkthdr.len == len + sizeof(*cpl)); 505 506 if ((val & F_DDP_PDU) == 0) { 507 MPASS(icp != NULL); 508 MPASS((icp->icp_flags & ICPF_RX_STATUS) == 0); 509 ip = &icp->ip; 510 } 511 512 if (icp == NULL) { 513 /* T6 completion enabled, start of a new PDU. */ 514 ip = icl_cxgbei_new_pdu(M_NOWAIT); 515 if (ip == NULL) 516 CXGBE_UNIMPLEMENTED("PDU allocation failure"); 517 icp = ip_to_icp(ip); 518 } 519 pdu_len = G_ISCSI_PDU_LEN(be16toh(cpl->pdu_len_ddp)); 520 521 #if 0 522 CTR5(KTR_CXGBE, 523 "%s: tid %u, cpl->len %u, ddpvld 0x%08x, icp %p", 524 __func__, tid, pdu_len, val, icp); 525 #endif 526 527 /* Copy header */ 528 m_copydata(m, sizeof(*cpl), ISCSI_BHS_SIZE, (caddr_t)ip->ip_bhs); 529 bhsdo = (struct iscsi_bhs_data_out *)ip->ip_bhs; 530 ip->ip_data_len = bhsdo->bhsdo_data_segment_len[0] << 16 | 531 bhsdo->bhsdo_data_segment_len[1] << 8 | 532 bhsdo->bhsdo_data_segment_len[2]; 533 icp->icp_seq = ntohl(cpl->seq); 534 icp->icp_flags |= ICPF_RX_HDR; 535 icp->icp_flags |= ICPF_RX_STATUS; 536 537 if (val & F_DDP_PADDING_ERR) { 538 ICL_WARN("received PDU 0x%02x with invalid padding", 539 ip->ip_bhs->bhs_opcode); 540 toep->ofld_rxq->rx_iscsi_padding_errors++; 541 } 542 if (val & F_DDP_HDRCRC_ERR) { 543 ICL_WARN("received PDU 0x%02x with invalid header digest", 544 ip->ip_bhs->bhs_opcode); 545 toep->ofld_rxq->rx_iscsi_header_digest_errors++; 546 } 547 if (val & F_DDP_DATACRC_ERR) { 548 ICL_WARN("received PDU 0x%02x with invalid data digest", 549 ip->ip_bhs->bhs_opcode); 550 toep->ofld_rxq->rx_iscsi_data_digest_errors++; 551 } 552 553 INP_WLOCK(inp); 554 if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) { 555 CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", 556 __func__, tid, pdu_len, inp->inp_flags); 557 INP_WUNLOCK(inp); 558 icl_cxgbei_conn_pdu_free(NULL, ip); 559 toep->ulpcb2 = NULL; 560 m_freem(m); 561 return (0); 562 } 563 564 tp = intotcpcb(inp); 565 566 /* 567 * If icc is NULL, the connection is being closed in 568 * icl_cxgbei_conn_close(), just drop this data. 569 */ 570 icc = toep->ulpcb; 571 if (__predict_false(icc == NULL)) { 572 CTR4(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes), icc %p", 573 __func__, tid, pdu_len, icc); 574 575 /* 576 * Update rcv_nxt so the sequence number of the FIN 577 * doesn't appear wrong. 578 */ 579 tp->rcv_nxt = icp->icp_seq + pdu_len; 580 tp->t_rcvtime = ticks; 581 INP_WUNLOCK(inp); 582 583 icl_cxgbei_conn_pdu_free(NULL, ip); 584 toep->ulpcb2 = NULL; 585 m_freem(m); 586 return (0); 587 } 588 589 MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE); 590 ic = &icc->ic; 591 if ((val & (F_DDP_PADDING_ERR | F_DDP_HDRCRC_ERR | 592 F_DDP_DATACRC_ERR)) != 0) { 593 INP_WUNLOCK(inp); 594 595 icl_cxgbei_conn_pdu_free(NULL, ip); 596 toep->ulpcb2 = NULL; 597 m_freem(m); 598 ic->ic_error(ic); 599 return (0); 600 } 601 602 #ifdef INVARIANTS 603 data_digest_len = (icc->ulp_submode & ULP_CRC_DATA) ? 604 ISCSI_DATA_DIGEST_SIZE : 0; 605 MPASS(roundup2(ip->ip_data_len, 4) == pdu_len - len - data_digest_len); 606 #endif 607 608 if (val & F_DDP_PDU && ip->ip_data_mbuf == NULL) { 609 MPASS((icp->icp_flags & ICPF_RX_FLBUF) == 0); 610 MPASS(ip->ip_data_len > 0); 611 icp->icp_flags |= ICPF_RX_DDP; 612 bhsdo = (struct iscsi_bhs_data_out *)ip->ip_bhs; 613 614 switch (ip->ip_bhs->bhs_opcode & ~ISCSI_BHS_OPCODE_IMMEDIATE) { 615 case ISCSI_BHS_OPCODE_SCSI_DATA_IN: 616 cmp = cxgbei_find_cmp(icc, 617 be32toh(bhsdo->bhsdo_initiator_task_tag)); 618 break; 619 case ISCSI_BHS_OPCODE_SCSI_DATA_OUT: 620 cmp = cxgbei_find_cmp(icc, 621 be32toh(bhsdo->bhsdo_target_transfer_tag)); 622 break; 623 default: 624 __assert_unreachable(); 625 } 626 MPASS(cmp != NULL); 627 628 /* 629 * The difference between the end of the last burst 630 * and the offset of the last PDU in this burst is 631 * the additional data received via DDP. 632 */ 633 prev_seg_len = be32toh(bhsdo->bhsdo_buffer_offset) - 634 cmp->next_buffer_offset; 635 636 if (prev_seg_len != 0) { 637 uint32_t orig_datasn; 638 639 /* 640 * Return a "large" PDU representing the burst 641 * of PDUs. Adjust the offset and length of 642 * this PDU to represent the entire burst. 643 */ 644 ip->ip_data_len += prev_seg_len; 645 bhsdo->bhsdo_data_segment_len[2] = ip->ip_data_len; 646 bhsdo->bhsdo_data_segment_len[1] = ip->ip_data_len >> 8; 647 bhsdo->bhsdo_data_segment_len[0] = ip->ip_data_len >> 16; 648 bhsdo->bhsdo_buffer_offset = 649 htobe32(cmp->next_buffer_offset); 650 651 orig_datasn = htobe32(bhsdo->bhsdo_datasn); 652 npdus = orig_datasn - cmp->last_datasn; 653 bhsdo->bhsdo_datasn = htobe32(cmp->last_datasn + 1); 654 cmp->last_datasn = orig_datasn; 655 ip->ip_additional_pdus = npdus - 1; 656 } else { 657 MPASS(htobe32(bhsdo->bhsdo_datasn) == 658 cmp->last_datasn + 1); 659 npdus = 1; 660 cmp->last_datasn = htobe32(bhsdo->bhsdo_datasn); 661 } 662 663 cmp->next_buffer_offset += ip->ip_data_len; 664 toep->ofld_rxq->rx_iscsi_ddp_pdus += npdus; 665 toep->ofld_rxq->rx_iscsi_ddp_octets += ip->ip_data_len; 666 } else { 667 MPASS(icp->icp_flags & (ICPF_RX_FLBUF)); 668 MPASS(ip->ip_data_len == ip->ip_data_mbuf->m_pkthdr.len); 669 } 670 671 tp->rcv_nxt = icp->icp_seq + pdu_len; 672 tp->t_rcvtime = ticks; 673 674 /* 675 * Don't update the window size or return credits since RX 676 * flow control is disabled. 677 */ 678 679 so = inp->inp_socket; 680 sb = &so->so_rcv; 681 SOCKBUF_LOCK(sb); 682 if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) { 683 CTR5(KTR_CXGBE, 684 "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x", 685 __func__, tid, pdu_len, icc, sb->sb_state); 686 SOCKBUF_UNLOCK(sb); 687 INP_WUNLOCK(inp); 688 689 CURVNET_SET(so->so_vnet); 690 NET_EPOCH_ENTER(et); 691 INP_WLOCK(inp); 692 tp = tcp_drop(tp, ECONNRESET); 693 if (tp != NULL) 694 INP_WUNLOCK(inp); 695 NET_EPOCH_EXIT(et); 696 CURVNET_RESTORE(); 697 698 icl_cxgbei_conn_pdu_free(NULL, ip); 699 toep->ulpcb2 = NULL; 700 m_freem(m); 701 return (0); 702 } 703 icl_cxgbei_new_pdu_set_conn(ip, ic); 704 705 /* Enqueue the PDU to the received pdus queue. */ 706 STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next); 707 if ((icc->rx_flags & RXF_ACTIVE) == 0) { 708 struct cxgbei_worker_thread_softc *cwt = &cwt_softc[icc->cwt]; 709 710 mtx_lock(&cwt->cwt_lock); 711 icc->rx_flags |= RXF_ACTIVE; 712 TAILQ_INSERT_TAIL(&cwt->rx_head, icc, rx_link); 713 if (cwt->cwt_state == CWT_SLEEPING) { 714 cwt->cwt_state = CWT_RUNNING; 715 cv_signal(&cwt->cwt_cv); 716 } 717 mtx_unlock(&cwt->cwt_lock); 718 } 719 SOCKBUF_UNLOCK(sb); 720 INP_WUNLOCK(inp); 721 722 toep->ulpcb2 = NULL; 723 m_freem(m); 724 725 return (0); 726 } 727 728 static int 729 cxgbei_activate(struct adapter *sc) 730 { 731 struct cxgbei_data *ci; 732 int rc; 733 734 ASSERT_SYNCHRONIZED_OP(sc); 735 736 if (uld_active(sc, ULD_ISCSI)) { 737 KASSERT(0, ("%s: iSCSI offload already enabled on adapter %p", 738 __func__, sc)); 739 return (0); 740 } 741 742 if (sc->iscsicaps == 0 || sc->vres.iscsi.size == 0) { 743 device_printf(sc->dev, 744 "not iSCSI offload capable, or capability disabled.\n"); 745 return (ENOSYS); 746 } 747 748 /* per-adapter softc for iSCSI */ 749 ci = malloc(sizeof(*ci), M_CXGBE, M_ZERO | M_WAITOK); 750 if (ci == NULL) 751 return (ENOMEM); 752 753 rc = cxgbei_init(sc, ci); 754 if (rc != 0) { 755 free(ci, M_CXGBE); 756 return (rc); 757 } 758 759 sc->iscsi_ulp_softc = ci; 760 761 return (0); 762 } 763 764 static int 765 cxgbei_deactivate(struct adapter *sc) 766 { 767 struct cxgbei_data *ci = sc->iscsi_ulp_softc; 768 769 ASSERT_SYNCHRONIZED_OP(sc); 770 771 if (ci != NULL) { 772 sysctl_ctx_free(&ci->ctx); 773 t4_free_ppod_region(&ci->pr); 774 free(ci, M_CXGBE); 775 sc->iscsi_ulp_softc = NULL; 776 } 777 778 return (0); 779 } 780 781 static void 782 cxgbei_activate_all(struct adapter *sc, void *arg __unused) 783 { 784 785 if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isact") != 0) 786 return; 787 788 /* Activate iSCSI if any port on this adapter has IFCAP_TOE enabled. */ 789 if (sc->offload_map && !uld_active(sc, ULD_ISCSI)) 790 (void) t4_activate_uld(sc, ULD_ISCSI); 791 792 end_synchronized_op(sc, 0); 793 } 794 795 static void 796 cxgbei_deactivate_all(struct adapter *sc, void *arg __unused) 797 { 798 799 if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isdea") != 0) 800 return; 801 802 if (uld_active(sc, ULD_ISCSI)) 803 (void) t4_deactivate_uld(sc, ULD_ISCSI); 804 805 end_synchronized_op(sc, 0); 806 } 807 808 static struct uld_info cxgbei_uld_info = { 809 .uld_id = ULD_ISCSI, 810 .activate = cxgbei_activate, 811 .deactivate = cxgbei_deactivate, 812 }; 813 814 static void 815 cwt_main(void *arg) 816 { 817 struct cxgbei_worker_thread_softc *cwt = arg; 818 struct icl_cxgbei_conn *icc = NULL; 819 struct icl_conn *ic; 820 struct icl_pdu *ip; 821 struct sockbuf *sb; 822 STAILQ_HEAD(, icl_pdu) rx_pdus = STAILQ_HEAD_INITIALIZER(rx_pdus); 823 824 MPASS(cwt != NULL); 825 826 mtx_lock(&cwt->cwt_lock); 827 MPASS(cwt->cwt_state == 0); 828 cwt->cwt_state = CWT_RUNNING; 829 cv_signal(&cwt->cwt_cv); 830 831 while (__predict_true(cwt->cwt_state != CWT_STOP)) { 832 cwt->cwt_state = CWT_RUNNING; 833 while ((icc = TAILQ_FIRST(&cwt->rx_head)) != NULL) { 834 TAILQ_REMOVE(&cwt->rx_head, icc, rx_link); 835 mtx_unlock(&cwt->cwt_lock); 836 837 ic = &icc->ic; 838 sb = &ic->ic_socket->so_rcv; 839 840 SOCKBUF_LOCK(sb); 841 MPASS(icc->rx_flags & RXF_ACTIVE); 842 if (__predict_true(!(sb->sb_state & SBS_CANTRCVMORE))) { 843 MPASS(STAILQ_EMPTY(&rx_pdus)); 844 STAILQ_SWAP(&icc->rcvd_pdus, &rx_pdus, icl_pdu); 845 SOCKBUF_UNLOCK(sb); 846 847 /* Hand over PDUs to ICL. */ 848 while ((ip = STAILQ_FIRST(&rx_pdus)) != NULL) { 849 STAILQ_REMOVE_HEAD(&rx_pdus, ip_next); 850 ic->ic_receive(ip); 851 } 852 853 SOCKBUF_LOCK(sb); 854 MPASS(STAILQ_EMPTY(&rx_pdus)); 855 } 856 MPASS(icc->rx_flags & RXF_ACTIVE); 857 if (STAILQ_EMPTY(&icc->rcvd_pdus) || 858 __predict_false(sb->sb_state & SBS_CANTRCVMORE)) { 859 icc->rx_flags &= ~RXF_ACTIVE; 860 } else { 861 /* 862 * More PDUs were received while we were busy 863 * handing over the previous batch to ICL. 864 * Re-add this connection to the end of the 865 * queue. 866 */ 867 mtx_lock(&cwt->cwt_lock); 868 TAILQ_INSERT_TAIL(&cwt->rx_head, icc, 869 rx_link); 870 mtx_unlock(&cwt->cwt_lock); 871 } 872 SOCKBUF_UNLOCK(sb); 873 874 mtx_lock(&cwt->cwt_lock); 875 } 876 877 /* Inner loop doesn't check for CWT_STOP, do that first. */ 878 if (__predict_false(cwt->cwt_state == CWT_STOP)) 879 break; 880 cwt->cwt_state = CWT_SLEEPING; 881 cv_wait(&cwt->cwt_cv, &cwt->cwt_lock); 882 } 883 884 MPASS(TAILQ_FIRST(&cwt->rx_head) == NULL); 885 mtx_assert(&cwt->cwt_lock, MA_OWNED); 886 cwt->cwt_state = CWT_STOPPED; 887 cv_signal(&cwt->cwt_cv); 888 mtx_unlock(&cwt->cwt_lock); 889 kthread_exit(); 890 } 891 892 static int 893 start_worker_threads(void) 894 { 895 int i, rc; 896 struct cxgbei_worker_thread_softc *cwt; 897 898 worker_thread_count = min(mp_ncpus, 32); 899 cwt_softc = malloc(worker_thread_count * sizeof(*cwt), M_CXGBE, 900 M_WAITOK | M_ZERO); 901 902 MPASS(cxgbei_proc == NULL); 903 for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) { 904 mtx_init(&cwt->cwt_lock, "cwt lock", NULL, MTX_DEF); 905 cv_init(&cwt->cwt_cv, "cwt cv"); 906 TAILQ_INIT(&cwt->rx_head); 907 rc = kproc_kthread_add(cwt_main, cwt, &cxgbei_proc, NULL, 0, 0, 908 "cxgbei", "%d", i); 909 if (rc != 0) { 910 printf("cxgbei: failed to start thread #%d/%d (%d)\n", 911 i + 1, worker_thread_count, rc); 912 mtx_destroy(&cwt->cwt_lock); 913 cv_destroy(&cwt->cwt_cv); 914 bzero(cwt, sizeof(*cwt)); 915 if (i == 0) { 916 free(cwt_softc, M_CXGBE); 917 worker_thread_count = 0; 918 919 return (rc); 920 } 921 922 /* Not fatal, carry on with fewer threads. */ 923 worker_thread_count = i; 924 rc = 0; 925 break; 926 } 927 928 /* Wait for thread to start before moving on to the next one. */ 929 mtx_lock(&cwt->cwt_lock); 930 while (cwt->cwt_state == 0) 931 cv_wait(&cwt->cwt_cv, &cwt->cwt_lock); 932 mtx_unlock(&cwt->cwt_lock); 933 } 934 935 MPASS(cwt_softc != NULL); 936 MPASS(worker_thread_count > 0); 937 return (0); 938 } 939 940 static void 941 stop_worker_threads(void) 942 { 943 int i; 944 struct cxgbei_worker_thread_softc *cwt = &cwt_softc[0]; 945 946 MPASS(worker_thread_count >= 0); 947 948 for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) { 949 mtx_lock(&cwt->cwt_lock); 950 MPASS(cwt->cwt_state == CWT_RUNNING || 951 cwt->cwt_state == CWT_SLEEPING); 952 cwt->cwt_state = CWT_STOP; 953 cv_signal(&cwt->cwt_cv); 954 do { 955 cv_wait(&cwt->cwt_cv, &cwt->cwt_lock); 956 } while (cwt->cwt_state != CWT_STOPPED); 957 mtx_unlock(&cwt->cwt_lock); 958 mtx_destroy(&cwt->cwt_lock); 959 cv_destroy(&cwt->cwt_cv); 960 } 961 free(cwt_softc, M_CXGBE); 962 } 963 964 /* Select a worker thread for a connection. */ 965 u_int 966 cxgbei_select_worker_thread(struct icl_cxgbei_conn *icc) 967 { 968 struct adapter *sc = icc->sc; 969 struct toepcb *toep = icc->toep; 970 u_int i, n; 971 972 n = worker_thread_count / sc->sge.nofldrxq; 973 if (n > 0) 974 i = toep->vi->pi->port_id * n + arc4random() % n; 975 else 976 i = arc4random() % worker_thread_count; 977 978 CTR3(KTR_CXGBE, "%s: tid %u, cwt %u", __func__, toep->tid, i); 979 980 return (i); 981 } 982 983 static int 984 cxgbei_mod_load(void) 985 { 986 int rc; 987 988 t4_register_cpl_handler(CPL_ISCSI_HDR, do_rx_iscsi_hdr); 989 t4_register_cpl_handler(CPL_ISCSI_DATA, do_rx_iscsi_data); 990 t4_register_cpl_handler(CPL_RX_ISCSI_DDP, do_rx_iscsi_ddp); 991 t4_register_cpl_handler(CPL_RX_ISCSI_CMP, do_rx_iscsi_cmp); 992 993 rc = start_worker_threads(); 994 if (rc != 0) 995 return (rc); 996 997 rc = t4_register_uld(&cxgbei_uld_info); 998 if (rc != 0) { 999 stop_worker_threads(); 1000 return (rc); 1001 } 1002 1003 t4_iterate(cxgbei_activate_all, NULL); 1004 1005 return (rc); 1006 } 1007 1008 static int 1009 cxgbei_mod_unload(void) 1010 { 1011 1012 t4_iterate(cxgbei_deactivate_all, NULL); 1013 1014 if (t4_unregister_uld(&cxgbei_uld_info) == EBUSY) 1015 return (EBUSY); 1016 1017 stop_worker_threads(); 1018 1019 t4_register_cpl_handler(CPL_ISCSI_HDR, NULL); 1020 t4_register_cpl_handler(CPL_ISCSI_DATA, NULL); 1021 t4_register_cpl_handler(CPL_RX_ISCSI_DDP, NULL); 1022 t4_register_cpl_handler(CPL_RX_ISCSI_CMP, NULL); 1023 1024 return (0); 1025 } 1026 #endif 1027 1028 static int 1029 cxgbei_modevent(module_t mod, int cmd, void *arg) 1030 { 1031 int rc = 0; 1032 1033 #ifdef TCP_OFFLOAD 1034 switch (cmd) { 1035 case MOD_LOAD: 1036 rc = cxgbei_mod_load(); 1037 if (rc == 0) 1038 rc = icl_cxgbei_mod_load(); 1039 break; 1040 1041 case MOD_UNLOAD: 1042 rc = icl_cxgbei_mod_unload(); 1043 if (rc == 0) 1044 rc = cxgbei_mod_unload(); 1045 break; 1046 1047 default: 1048 rc = EINVAL; 1049 } 1050 #else 1051 printf("cxgbei: compiled without TCP_OFFLOAD support.\n"); 1052 rc = EOPNOTSUPP; 1053 #endif 1054 1055 return (rc); 1056 } 1057 1058 static moduledata_t cxgbei_mod = { 1059 "cxgbei", 1060 cxgbei_modevent, 1061 NULL, 1062 }; 1063 1064 MODULE_VERSION(cxgbei, 1); 1065 DECLARE_MODULE(cxgbei, cxgbei_mod, SI_SUB_EXEC, SI_ORDER_ANY); 1066 MODULE_DEPEND(cxgbei, t4_tom, 1, 1, 1); 1067 MODULE_DEPEND(cxgbei, cxgbe, 1, 1, 1); 1068 MODULE_DEPEND(cxgbei, icl, 1, 1, 1); 1069