/*-
 * Copyright (c) 2012 Chelsio Communications, Inc.
 * All rights reserved.
 *
 * Chelsio T5xx iSCSI driver
 *
 * Written by: Sreenivasa Honnur <shonnur@chelsio.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"

#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/systm.h>

#ifdef TCP_OFFLOAD
#include <sys/errno.h>
#include <sys/kthread.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/mbuf.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/condvar.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/toecore.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>

#include <cam/scsi/scsi_all.h>
#include <cam/scsi/scsi_da.h>
#include <cam/ctl/ctl_io.h>
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_backend.h>
#include <cam/ctl/ctl_error.h>
#include <cam/ctl/ctl_frontend.h>
#include <cam/ctl/ctl_debug.h>
#include <cam/ctl/ctl_ha.h>
#include <cam/ctl/ctl_ioctl.h>

#include <dev/iscsi/icl.h>
#include <dev/iscsi/iscsi_proto.h>
#include <dev/iscsi/iscsi_ioctl.h>
#include <dev/iscsi/iscsi.h>
#include <cam/ctl/ctl_frontend_iscsi.h>

#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_xpt.h>
#include <cam/cam_debug.h>
#include <cam/cam_sim.h>
#include <cam/cam_xpt_sim.h>
#include <cam/cam_xpt_periph.h>
#include <cam/cam_periph.h>
#include <cam/cam_compat.h>
#include <cam/scsi/scsi_message.h>

#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"     /* for PCIE_MEM_ACCESS */
#include "tom/t4_tom.h"
#include "cxgbei.h"
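
/*
 * Glue between the Chelsio TOE and the kernel iSCSI stack: CPL handlers
 * that reassemble offloaded iSCSI PDUs, ULD activate/deactivate hooks for
 * the adapter, and a pool of worker threads that hand received PDUs over
 * to ICL.
 */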

static int worker_thread_count;
static struct cxgbei_worker_thread_softc *cwt_softc;
static struct proc *cxgbei_proc;

/* XXXNP some header instead. */
struct icl_pdu *icl_cxgbei_new_pdu(int);
void icl_cxgbei_new_pdu_set_conn(struct icl_pdu *, struct icl_conn *);
void icl_cxgbei_conn_pdu_free(struct icl_conn *, struct icl_pdu *);

static void
free_ci_counters(struct cxgbei_data *ci)
{

#define FREE_CI_COUNTER(x) do { \
        if (ci->x != NULL) { \
                counter_u64_free(ci->x); \
                ci->x = NULL; \
        } \
} while (0)

        FREE_CI_COUNTER(ddp_setup_ok);
        FREE_CI_COUNTER(ddp_setup_error);
        FREE_CI_COUNTER(ddp_bytes);
        FREE_CI_COUNTER(ddp_pdus);
        FREE_CI_COUNTER(fl_bytes);
        FREE_CI_COUNTER(fl_pdus);
#undef FREE_CI_COUNTER
}

static int
alloc_ci_counters(struct cxgbei_data *ci)
{

#define ALLOC_CI_COUNTER(x) do { \
        ci->x = counter_u64_alloc(M_WAITOK); \
        if (ci->x == NULL) \
                goto fail; \
} while (0)

        ALLOC_CI_COUNTER(ddp_setup_ok);
        ALLOC_CI_COUNTER(ddp_setup_error);
        ALLOC_CI_COUNTER(ddp_bytes);
        ALLOC_CI_COUNTER(ddp_pdus);
        ALLOC_CI_COUNTER(fl_bytes);
        ALLOC_CI_COUNTER(fl_pdus);
#undef ALLOC_CI_COUNTER

        return (0);
fail:
        free_ci_counters(ci);
        return (ENOMEM);
}
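
/*
 * Determine the maximum tx/rx iSCSI PDU lengths the hardware can handle:
 * the minimum of the PMM rx/tx page sizes, MAXRXDATA in TP_PARA_REG2, and
 * the two PMMAXXFERLEN limits in TP_PARA_REG7, rounded down to a multiple
 * of 512.  For example (assuming the other limits are larger), even a 16K
 * PMM tx page size yields a 12288-byte max tx PDU because of the
 * temporary 3 * 4096 firmware cap below.
 */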

static void
read_pdu_limits(struct adapter *sc, uint32_t *max_tx_pdu_len,
    uint32_t *max_rx_pdu_len)
{
        uint32_t tx_len, rx_len, r, v;

        rx_len = t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE);
        tx_len = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE);

        r = t4_read_reg(sc, A_TP_PARA_REG2);
        rx_len = min(rx_len, G_MAXRXDATA(r));
        tx_len = min(tx_len, G_MAXRXDATA(r));

        r = t4_read_reg(sc, A_TP_PARA_REG7);
        v = min(G_PMMAXXFERLEN0(r), G_PMMAXXFERLEN1(r));
        rx_len = min(rx_len, v);
        tx_len = min(tx_len, v);

        /* Remove after FW_FLOWC_MNEM_TXDATAPLEN_MAX fix in firmware. */
        tx_len = min(tx_len, 3 * 4096);

        *max_tx_pdu_len = rounddown2(tx_len, 512);
        *max_rx_pdu_len = rounddown2(rx_len, 512);
}

/*
 * Initialize the software state of the iSCSI ULP driver.
 *
 * ENXIO means firmware didn't set up something that it was supposed to.
 */
static int
cxgbei_init(struct adapter *sc, struct cxgbei_data *ci)
{
        struct sysctl_oid *oid;
        struct sysctl_oid_list *children;
        struct ppod_region *pr;
        uint32_t r;
        int rc;

        MPASS(sc->vres.iscsi.size > 0);
        MPASS(ci != NULL);

        rc = alloc_ci_counters(ci);
        if (rc != 0)
                return (rc);

        read_pdu_limits(sc, &ci->max_tx_pdu_len, &ci->max_rx_pdu_len);

        pr = &ci->pr;
        r = t4_read_reg(sc, A_ULP_RX_ISCSI_PSZ);
        rc = t4_init_ppod_region(pr, &sc->vres.iscsi, r, "iSCSI page pods");
        if (rc != 0) {
                device_printf(sc->dev,
                    "%s: failed to initialize the iSCSI page pod region: %u.\n",
                    __func__, rc);
                free_ci_counters(ci);
                return (rc);
        }

        r = t4_read_reg(sc, A_ULP_RX_ISCSI_TAGMASK);
        r &= V_ISCSITAGMASK(M_ISCSITAGMASK);
        if (r != pr->pr_tag_mask) {
                /*
                 * Recent firmwares are supposed to set up the iSCSI tagmask
                 * but we'll do it ourselves if the computed value doesn't
                 * match what's in the register.
                 */
                device_printf(sc->dev,
                    "tagmask 0x%08x does not match computed mask 0x%08x.\n", r,
                    pr->pr_tag_mask);
                t4_set_reg_field(sc, A_ULP_RX_ISCSI_TAGMASK,
                    V_ISCSITAGMASK(M_ISCSITAGMASK), pr->pr_tag_mask);
        }

        sysctl_ctx_init(&ci->ctx);
        oid = device_get_sysctl_tree(sc->dev);  /* dev.t5nex.X */
        children = SYSCTL_CHILDREN(oid);

        oid = SYSCTL_ADD_NODE(&ci->ctx, children, OID_AUTO, "iscsi", CTLFLAG_RD,
            NULL, "iSCSI ULP statistics");
        children = SYSCTL_CHILDREN(oid);

        SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "ddp_setup_ok",
            CTLFLAG_RD, &ci->ddp_setup_ok,
            "# of times DDP buffer was setup successfully.");

        SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "ddp_setup_error",
            CTLFLAG_RD, &ci->ddp_setup_error,
            "# of times DDP buffer setup failed.");

        SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "ddp_bytes",
            CTLFLAG_RD, &ci->ddp_bytes, "# of bytes placed directly");

        SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "ddp_pdus",
            CTLFLAG_RD, &ci->ddp_pdus, "# of PDUs with data placed directly.");

        SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "fl_bytes",
            CTLFLAG_RD, &ci->fl_bytes, "# of data bytes delivered in freelist");

        SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "fl_pdus",
            CTLFLAG_RD, &ci->fl_pdus,
            "# of PDUs with data delivered in freelist");

        ci->ddp_threshold = 2048;
        SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "ddp_threshold",
            CTLFLAG_RW, &ci->ddp_threshold, 0, "Rx zero copy threshold");

        return (0);
}
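
/*
 * An offloaded iSCSI PDU arrives as up to three CPL messages on the same
 * tid: CPL_ISCSI_HDR with the BHS, an optional CPL_ISCSI_DATA with any
 * payload that was not placed directly, and CPL_RX_ISCSI_DDP with the
 * digest/DDP status.  The partially assembled PDU is parked in
 * toep->ulpcb2 between messages.
 */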

static int
do_rx_iscsi_hdr(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
        struct adapter *sc = iq->adapter;
        struct cpl_iscsi_hdr *cpl = mtod(m, struct cpl_iscsi_hdr *);
        u_int tid = GET_TID(cpl);
        struct toepcb *toep = lookup_tid(sc, tid);
        struct icl_pdu *ip;
        struct icl_cxgbei_pdu *icp;
        uint16_t len_ddp = be16toh(cpl->pdu_len_ddp);
        uint16_t len = be16toh(cpl->len);

        M_ASSERTPKTHDR(m);
        MPASS(m->m_pkthdr.len == len + sizeof(*cpl));

        ip = icl_cxgbei_new_pdu(M_NOWAIT);
        if (ip == NULL)
                CXGBE_UNIMPLEMENTED("PDU allocation failure");
        m_copydata(m, sizeof(*cpl), ISCSI_BHS_SIZE, (caddr_t)ip->ip_bhs);
        ip->ip_data_len = G_ISCSI_PDU_LEN(len_ddp) - len;
        icp = ip_to_icp(ip);
        icp->icp_seq = ntohl(cpl->seq);
        icp->icp_flags = ICPF_RX_HDR;

        /* This is the start of a new PDU.  There should be no old state. */
        MPASS(toep->ulpcb2 == NULL);
        toep->ulpcb2 = icp;

#if 0
        CTR5(KTR_CXGBE, "%s: tid %u, cpl->len %u, pdu_len_ddp 0x%04x, icp %p",
            __func__, tid, len, len_ddp, icp);
#endif

        m_freem(m);
        return (0);
}

static int
do_rx_iscsi_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
        struct adapter *sc = iq->adapter;
        struct cxgbei_data *ci = sc->iscsi_ulp_softc;
        struct cpl_iscsi_data *cpl = mtod(m, struct cpl_iscsi_data *);
        u_int tid = GET_TID(cpl);
        struct toepcb *toep = lookup_tid(sc, tid);
        struct icl_cxgbei_pdu *icp = toep->ulpcb2;

        M_ASSERTPKTHDR(m);
        MPASS(m->m_pkthdr.len == be16toh(cpl->len) + sizeof(*cpl));

        /* Must already have received the header (but not the data). */
        MPASS(icp != NULL);
        MPASS(icp->icp_flags == ICPF_RX_HDR);
        MPASS(icp->ip.ip_data_mbuf == NULL);

        m_adj(m, sizeof(*cpl));
        MPASS(icp->ip.ip_data_len == m->m_pkthdr.len);

        icp->icp_flags |= ICPF_RX_FLBUF;
        icp->ip.ip_data_mbuf = m;
        counter_u64_add(ci->fl_pdus, 1);
        counter_u64_add(ci->fl_bytes, m->m_pkthdr.len);

#if 0
        CTR3(KTR_CXGBE, "%s: tid %u, cpl->len %u", __func__, tid,
            be16toh(cpl->len));
#endif

        return (0);
}
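
/*
 * CPL_RX_ISCSI_DDP completes the PDU: it carries the pad/CRC status and,
 * when the payload was placed directly into the receive buffer (DDP),
 * there is no accompanying CPL_ISCSI_DATA.  This handler also advances
 * the TCP state for the bytes the chip consumed on the host's behalf and
 * queues the finished PDU for one of the worker threads.
 */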

static int
do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
        struct adapter *sc = iq->adapter;
        struct cxgbei_data *ci = sc->iscsi_ulp_softc;
        const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
        u_int tid = GET_TID(cpl);
        struct toepcb *toep = lookup_tid(sc, tid);
        struct inpcb *inp = toep->inp;
        struct socket *so;
        struct sockbuf *sb;
        struct tcpcb *tp;
        struct icl_cxgbei_conn *icc;
        struct icl_conn *ic;
        struct icl_cxgbei_pdu *icp = toep->ulpcb2;
        struct icl_pdu *ip;
        u_int pdu_len, val;

        MPASS(m == NULL);

        /* Must already be assembling a PDU. */
        MPASS(icp != NULL);
        MPASS(icp->icp_flags & ICPF_RX_HDR);    /* Data is optional. */
        MPASS((icp->icp_flags & ICPF_RX_STATUS) == 0);

        pdu_len = be16toh(cpl->len);    /* includes everything. */
        val = be32toh(cpl->ddpvld);

#if 0
        CTR5(KTR_CXGBE,
            "%s: tid %u, cpl->len %u, ddpvld 0x%08x, icp_flags 0x%08x",
            __func__, tid, pdu_len, val, icp->icp_flags);
#endif

        icp->icp_flags |= ICPF_RX_STATUS;
        ip = &icp->ip;
        if (val & F_DDP_PADDING_ERR)
                icp->icp_flags |= ICPF_PAD_ERR;
        if (val & F_DDP_HDRCRC_ERR)
                icp->icp_flags |= ICPF_HCRC_ERR;
        if (val & F_DDP_DATACRC_ERR)
                icp->icp_flags |= ICPF_DCRC_ERR;
        if (val & F_DDP_PDU && ip->ip_data_mbuf == NULL) {
                MPASS((icp->icp_flags & ICPF_RX_FLBUF) == 0);
                MPASS(ip->ip_data_len > 0);
                icp->icp_flags |= ICPF_RX_DDP;
                counter_u64_add(ci->ddp_pdus, 1);
                counter_u64_add(ci->ddp_bytes, ip->ip_data_len);
        }

        INP_WLOCK(inp);
        if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
                CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
                    __func__, tid, pdu_len, inp->inp_flags);
                INP_WUNLOCK(inp);
                icl_cxgbei_conn_pdu_free(NULL, ip);
#ifdef INVARIANTS
                toep->ulpcb2 = NULL;
#endif
                return (0);
        }

        tp = intotcpcb(inp);
        MPASS(icp->icp_seq == tp->rcv_nxt);
        MPASS(tp->rcv_wnd >= pdu_len);
        tp->rcv_nxt += pdu_len;
        tp->rcv_wnd -= pdu_len;
        tp->t_rcvtime = ticks;

        /* update rx credits */
        toep->rx_credits += pdu_len;
        t4_rcvd(&toep->td->tod, tp);    /* XXX: sc->tom_softc.tod */

        so = inp->inp_socket;
        sb = &so->so_rcv;
        SOCKBUF_LOCK(sb);

        icc = toep->ulpcb;
        if (__predict_false(icc == NULL || sb->sb_state & SBS_CANTRCVMORE)) {
                CTR5(KTR_CXGBE,
                    "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x",
                    __func__, tid, pdu_len, icc, sb->sb_state);
                SOCKBUF_UNLOCK(sb);
                INP_WUNLOCK(inp);

                INP_INFO_RLOCK(&V_tcbinfo);
                INP_WLOCK(inp);
                tp = tcp_drop(tp, ECONNRESET);
                if (tp)
                        INP_WUNLOCK(inp);
                INP_INFO_RUNLOCK(&V_tcbinfo);

                icl_cxgbei_conn_pdu_free(NULL, ip);
#ifdef INVARIANTS
                toep->ulpcb2 = NULL;
#endif
                return (0);
        }
        MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
        ic = &icc->ic;
        icl_cxgbei_new_pdu_set_conn(ip, ic);

        MPASS(m == NULL);       /* was unused, we'll use it now. */
        m = sbcut_locked(sb, sbused(sb));       /* XXXNP: toep->sb_cc accounting? */
        if (__predict_false(m != NULL)) {
                int len = m_length(m, NULL);

                /*
                 * PDUs were received before the tid transitioned to ULP mode.
                 * Convert them to icl_cxgbei_pdus and send them to ICL before
                 * the PDU in icp/ip.
                 */
                CTR3(KTR_CXGBE, "%s: tid %u, %u bytes in so_rcv", __func__, tid,
                    len);

                /* XXXNP: needs to be rewritten. */
                if (len == sizeof(struct iscsi_bhs) ||
                    len == 4 + sizeof(struct iscsi_bhs)) {
                        struct icl_cxgbei_pdu *icp0;
                        struct icl_pdu *ip0;

                        ip0 = icl_cxgbei_new_pdu(M_NOWAIT);
                        if (ip0 == NULL)
                                CXGBE_UNIMPLEMENTED("PDU allocation failure");
                        icl_cxgbei_new_pdu_set_conn(ip0, ic);
                        icp0 = ip_to_icp(ip0);
                        icp0->icp_seq = 0;      /* XXX */
                        icp0->icp_flags = ICPF_RX_HDR | ICPF_RX_STATUS;
                        m_copydata(m, 0, sizeof(struct iscsi_bhs),
                            (void *)ip0->ip_bhs);
                        STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip0, ip_next);
                }
                m_freem(m);
        }

        STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);
        if ((icc->rx_flags & RXF_ACTIVE) == 0) {
                struct cxgbei_worker_thread_softc *cwt = &cwt_softc[icc->cwt];

                mtx_lock(&cwt->cwt_lock);
                icc->rx_flags |= RXF_ACTIVE;
                TAILQ_INSERT_TAIL(&cwt->rx_head, icc, rx_link);
                if (cwt->cwt_state == CWT_SLEEPING) {
                        cwt->cwt_state = CWT_RUNNING;
                        cv_signal(&cwt->cwt_cv);
                }
                mtx_unlock(&cwt->cwt_lock);
        }
        SOCKBUF_UNLOCK(sb);
        INP_WUNLOCK(inp);

#ifdef INVARIANTS
        toep->ulpcb2 = NULL;
#endif

        return (0);
}

static int
cxgbei_activate(struct adapter *sc)
{
        struct cxgbei_data *ci;
        int rc;

        ASSERT_SYNCHRONIZED_OP(sc);

        if (uld_active(sc, ULD_ISCSI)) {
                KASSERT(0, ("%s: iSCSI offload already enabled on adapter %p",
                    __func__, sc));
                return (0);
        }

        if (sc->iscsicaps == 0 || sc->vres.iscsi.size == 0) {
                device_printf(sc->dev,
                    "not iSCSI offload capable, or capability disabled.\n");
                return (ENOSYS);
        }

        /* per-adapter softc for iSCSI */
        ci = malloc(sizeof(*ci), M_CXGBE, M_ZERO | M_WAITOK);
        if (ci == NULL)
                return (ENOMEM);

        rc = cxgbei_init(sc, ci);
        if (rc != 0) {
                free(ci, M_CXGBE);
                return (rc);
        }

        sc->iscsi_ulp_softc = ci;

        return (0);
}

static int
cxgbei_deactivate(struct adapter *sc)
{
        struct cxgbei_data *ci = sc->iscsi_ulp_softc;

        ASSERT_SYNCHRONIZED_OP(sc);

        if (ci != NULL) {
                sysctl_ctx_free(&ci->ctx);
                t4_free_ppod_region(&ci->pr);
                free_ci_counters(ci);
                free(ci, M_CXGBE);
                sc->iscsi_ulp_softc = NULL;
        }

        return (0);
}
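
/*
 * Run by t4_iterate() over every adapter at module load/unload; see
 * cxgbei_mod_load() and cxgbei_mod_unload() below.
 */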

static void
cxgbei_activate_all(struct adapter *sc, void *arg __unused)
{

        if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isact") != 0)
                return;

        /* Activate iSCSI if any port on this adapter has IFCAP_TOE enabled. */
        if (sc->offload_map && !uld_active(sc, ULD_ISCSI))
                (void) t4_activate_uld(sc, ULD_ISCSI);

        end_synchronized_op(sc, 0);
}

static void
cxgbei_deactivate_all(struct adapter *sc, void *arg __unused)
{

        if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isdea") != 0)
                return;

        if (uld_active(sc, ULD_ISCSI))
                (void) t4_deactivate_uld(sc, ULD_ISCSI);

        end_synchronized_op(sc, 0);
}

static struct uld_info cxgbei_uld_info = {
        .uld_id = ULD_ISCSI,
        .activate = cxgbei_activate,
        .deactivate = cxgbei_deactivate,
};
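
/*
 * Worker thread main loop.  Each thread cycles between CWT_RUNNING, where
 * it drains its queue of connections with pending PDUs, and CWT_SLEEPING,
 * where it waits on cwt_cv for do_rx_iscsi_ddp() to queue more work.
 * stop_worker_threads() moves it to CWT_STOP and waits for CWT_STOPPED.
 */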

static void
cwt_main(void *arg)
{
        struct cxgbei_worker_thread_softc *cwt = arg;
        struct icl_cxgbei_conn *icc = NULL;
        struct icl_conn *ic;
        struct icl_pdu *ip;
        struct sockbuf *sb;
        STAILQ_HEAD(, icl_pdu) rx_pdus = STAILQ_HEAD_INITIALIZER(rx_pdus);

        MPASS(cwt != NULL);

        mtx_lock(&cwt->cwt_lock);
        MPASS(cwt->cwt_state == 0);
        cwt->cwt_state = CWT_RUNNING;
        cv_signal(&cwt->cwt_cv);

        while (__predict_true(cwt->cwt_state != CWT_STOP)) {
                cwt->cwt_state = CWT_RUNNING;
                while ((icc = TAILQ_FIRST(&cwt->rx_head)) != NULL) {
                        TAILQ_REMOVE(&cwt->rx_head, icc, rx_link);
                        mtx_unlock(&cwt->cwt_lock);

                        ic = &icc->ic;
                        sb = &ic->ic_socket->so_rcv;

                        SOCKBUF_LOCK(sb);
                        MPASS(icc->rx_flags & RXF_ACTIVE);
                        if (__predict_true(!(sb->sb_state & SBS_CANTRCVMORE))) {
                                MPASS(STAILQ_EMPTY(&rx_pdus));
                                STAILQ_SWAP(&icc->rcvd_pdus, &rx_pdus, icl_pdu);
                                SOCKBUF_UNLOCK(sb);

                                /* Hand over PDUs to ICL. */
                                while ((ip = STAILQ_FIRST(&rx_pdus)) != NULL) {
                                        STAILQ_REMOVE_HEAD(&rx_pdus, ip_next);
                                        ic->ic_receive(ip);
                                }

                                SOCKBUF_LOCK(sb);
                                MPASS(STAILQ_EMPTY(&rx_pdus));
                        }
                        MPASS(icc->rx_flags & RXF_ACTIVE);
                        if (STAILQ_EMPTY(&icc->rcvd_pdus) ||
                            __predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
                                icc->rx_flags &= ~RXF_ACTIVE;
                        } else {
                                /*
                                 * More PDUs were received while we were busy
                                 * handing over the previous batch to ICL.
                                 * Re-add this connection to the end of the
                                 * queue.
                                 */
                                mtx_lock(&cwt->cwt_lock);
                                TAILQ_INSERT_TAIL(&cwt->rx_head, icc,
                                    rx_link);
                                mtx_unlock(&cwt->cwt_lock);
                        }
                        SOCKBUF_UNLOCK(sb);

                        mtx_lock(&cwt->cwt_lock);
                }

                /* Inner loop doesn't check for CWT_STOP, do that first. */
                if (__predict_false(cwt->cwt_state == CWT_STOP))
                        break;
                cwt->cwt_state = CWT_SLEEPING;
                cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
        }

        MPASS(TAILQ_FIRST(&cwt->rx_head) == NULL);
        mtx_assert(&cwt->cwt_lock, MA_OWNED);
        cwt->cwt_state = CWT_STOPPED;
        cv_signal(&cwt->cwt_cv);
        mtx_unlock(&cwt->cwt_lock);
        kthread_exit();
}

static int
start_worker_threads(void)
{
        int i, rc;
        struct cxgbei_worker_thread_softc *cwt;

        worker_thread_count = min(mp_ncpus, 32);
        cwt_softc = malloc(worker_thread_count * sizeof(*cwt), M_CXGBE,
            M_WAITOK | M_ZERO);

        MPASS(cxgbei_proc == NULL);
        for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) {
                mtx_init(&cwt->cwt_lock, "cwt lock", NULL, MTX_DEF);
                cv_init(&cwt->cwt_cv, "cwt cv");
                TAILQ_INIT(&cwt->rx_head);
                rc = kproc_kthread_add(cwt_main, cwt, &cxgbei_proc, NULL, 0, 0,
                    "cxgbei", "%d", i);
                if (rc != 0) {
                        printf("cxgbei: failed to start thread #%d/%d (%d)\n",
                            i + 1, worker_thread_count, rc);
                        mtx_destroy(&cwt->cwt_lock);
                        cv_destroy(&cwt->cwt_cv);
                        bzero(cwt, sizeof(*cwt));
                        if (i == 0) {
                                free(cwt_softc, M_CXGBE);
                                worker_thread_count = 0;

                                return (rc);
                        }

                        /* Not fatal, carry on with fewer threads. */
                        worker_thread_count = i;
                        rc = 0;
                        break;
                }

                /* Wait for thread to start before moving on to the next one. */
                mtx_lock(&cwt->cwt_lock);
                while (cwt->cwt_state == 0)
                        cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
                mtx_unlock(&cwt->cwt_lock);
        }

        MPASS(cwt_softc != NULL);
        MPASS(worker_thread_count > 0);
        return (0);
}

static void
stop_worker_threads(void)
{
        int i;
        struct cxgbei_worker_thread_softc *cwt;

        MPASS(worker_thread_count >= 0);

        for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) {
                mtx_lock(&cwt->cwt_lock);
                MPASS(cwt->cwt_state == CWT_RUNNING ||
                    cwt->cwt_state == CWT_SLEEPING);
                cwt->cwt_state = CWT_STOP;
                cv_signal(&cwt->cwt_cv);
                do {
                        cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
                } while (cwt->cwt_state != CWT_STOPPED);
                mtx_unlock(&cwt->cwt_lock);
        }
        free(cwt_softc, M_CXGBE);
}
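
/*
 * When there are at least as many worker threads as offload rx queues,
 * the intent appears to be to give each port its own contiguous block of
 * threads: e.g. with 8 threads and sge.nofldrxq = 4, n = 2 and a
 * connection on port 1 lands on thread 2 or 3.  Otherwise a thread is
 * picked at random.
 */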
/* Select a worker thread for a connection. */
u_int
cxgbei_select_worker_thread(struct icl_cxgbei_conn *icc)
{
        struct adapter *sc = icc->sc;
        struct toepcb *toep = icc->toep;
        u_int i, n;

        n = worker_thread_count / sc->sge.nofldrxq;
        if (n > 0)
                i = toep->vi->pi->port_id * n + arc4random() % n;
        else
                i = arc4random() % worker_thread_count;

        CTR3(KTR_CXGBE, "%s: tid %u, cwt %u", __func__, toep->tid, i);

        return (i);
}

static int
cxgbei_mod_load(void)
{
        int rc;

        t4_register_cpl_handler(CPL_ISCSI_HDR, do_rx_iscsi_hdr);
        t4_register_cpl_handler(CPL_ISCSI_DATA, do_rx_iscsi_data);
        t4_register_cpl_handler(CPL_RX_ISCSI_DDP, do_rx_iscsi_ddp);

        rc = start_worker_threads();
        if (rc != 0)
                return (rc);

        rc = t4_register_uld(&cxgbei_uld_info);
        if (rc != 0) {
                stop_worker_threads();
                return (rc);
        }

        t4_iterate(cxgbei_activate_all, NULL);

        return (rc);
}

static int
cxgbei_mod_unload(void)
{

        t4_iterate(cxgbei_deactivate_all, NULL);

        if (t4_unregister_uld(&cxgbei_uld_info) == EBUSY)
                return (EBUSY);

        stop_worker_threads();

        t4_register_cpl_handler(CPL_ISCSI_HDR, NULL);
        t4_register_cpl_handler(CPL_ISCSI_DATA, NULL);
        t4_register_cpl_handler(CPL_RX_ISCSI_DDP, NULL);

        return (0);
}
#endif

static int
cxgbei_modevent(module_t mod, int cmd, void *arg)
{
        int rc = 0;

#ifdef TCP_OFFLOAD
        switch (cmd) {
        case MOD_LOAD:
                rc = cxgbei_mod_load();
                if (rc == 0)
                        rc = icl_cxgbei_mod_load();
                break;

        case MOD_UNLOAD:
                rc = icl_cxgbei_mod_unload();
                if (rc == 0)
                        rc = cxgbei_mod_unload();
                break;

        default:
                rc = EINVAL;
        }
#else
        printf("cxgbei: compiled without TCP_OFFLOAD support.\n");
        rc = EOPNOTSUPP;
#endif

        return (rc);
}

static moduledata_t cxgbei_mod = {
        "cxgbei",
        cxgbei_modevent,
        NULL,
};

MODULE_VERSION(cxgbei, 1);
DECLARE_MODULE(cxgbei, cxgbei_mod, SI_SUB_EXEC, SI_ORDER_ANY);
MODULE_DEPEND(cxgbei, t4_tom, 1, 1, 1);
MODULE_DEPEND(cxgbei, cxgbe, 1, 1, 1);
MODULE_DEPEND(cxgbei, icl, 1, 1, 1);