1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2012 Chelsio Communications, Inc. 5 * All rights reserved. 6 * Written by: Navdeep Parhar <np@FreeBSD.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include <sys/cdefs.h> 31 #include "opt_inet.h" 32 #include "opt_inet6.h" 33 34 #ifdef TCP_OFFLOAD 35 #include <sys/param.h> 36 #include <sys/types.h> 37 #include <sys/kernel.h> 38 #include <sys/ktr.h> 39 #include <sys/module.h> 40 #include <sys/protosw.h> 41 #include <sys/refcount.h> 42 #include <sys/domain.h> 43 #include <sys/fnv_hash.h> 44 #include <sys/socket.h> 45 #include <sys/socketvar.h> 46 #include <sys/sysctl.h> 47 #include <net/ethernet.h> 48 #include <net/if.h> 49 #include <net/if_types.h> 50 #include <net/if_vlan_var.h> 51 #include <net/route.h> 52 #include <net/route/nhop.h> 53 #include <netinet/in.h> 54 #include <netinet/in_fib.h> 55 #include <netinet/in_pcb.h> 56 #include <netinet/ip.h> 57 #include <netinet/ip6.h> 58 #include <netinet6/in6_fib.h> 59 #include <netinet6/scope6_var.h> 60 #include <netinet/tcp_timer.h> 61 #define TCPSTATES 62 #include <netinet/tcp_fsm.h> 63 #include <netinet/tcp_var.h> 64 #include <netinet/toecore.h> 65 #include <netinet/cc/cc.h> 66 67 #include "common/common.h" 68 #include "common/t4_msg.h" 69 #include "common/t4_regs.h" 70 #include "t4_clip.h" 71 #include "tom/t4_tom_l2t.h" 72 #include "tom/t4_tom.h" 73 74 /* stid services */ 75 static int alloc_stid(struct adapter *, bool, void *); 76 static struct listen_ctx *lookup_stid(struct adapter *, int); 77 static void free_stid(struct adapter *, int , bool); 78 79 /* lctx services */ 80 static struct listen_ctx *alloc_lctx(struct adapter *, struct inpcb *, 81 struct vi_info *); 82 static int free_lctx(struct adapter *, struct listen_ctx *); 83 static void hold_lctx(struct listen_ctx *); 84 static void listen_hash_add(struct adapter *, struct listen_ctx *); 85 static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *); 86 static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *); 87 static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *); 88 89 static void send_abort_rpl_synqe(struct toedev *, struct synq_entry *, int); 90 91 static int create_server6(struct adapter *, struct listen_ctx *); 92 static int create_server(struct adapter *, struct listen_ctx *); 93 94 int 95 alloc_stid_tab(struct adapter *sc) 96 { 97 struct tid_info *t = &sc->tids; 98 99 MPASS(t->nstids > 0); 100 MPASS(t->stid_tab == NULL); 101 102 t->stid_tab = malloc(t->nstids * sizeof(*t->stid_tab), M_CXGBE, 103 M_ZERO | M_NOWAIT); 104 if (t->stid_tab == NULL) 105 return (ENOMEM); 106 t->stid_bitmap = bit_alloc(t->nstids, M_CXGBE, M_NOWAIT); 107 if (t->stid_bitmap == NULL) { 108 free(t->stid_tab, M_CXGBE); 109 t->stid_tab = NULL; 110 return (ENOMEM); 111 } 112 mtx_init(&t->stid_lock, "stid lock", NULL, MTX_DEF); 113 t->stids_in_use = 0; 114 115 return (0); 116 } 117 118 void 119 free_stid_tab(struct adapter *sc) 120 { 121 struct tid_info *t = &sc->tids; 122 123 KASSERT(t->stids_in_use == 0, 124 ("%s: %d tids still in use.", __func__, t->stids_in_use)); 125 126 if (mtx_initialized(&t->stid_lock)) 127 mtx_destroy(&t->stid_lock); 128 free(t->stid_tab, M_CXGBE); 129 t->stid_tab = NULL; 130 free(t->stid_bitmap, M_CXGBE); 131 t->stid_bitmap = NULL; 132 } 133 134 void 135 stop_stid_tab(struct adapter *sc) 136 { 137 struct tid_info *t = &sc->tids; 138 struct tom_data *td = sc->tom_softc; 139 struct listen_ctx *lctx; 140 struct synq_entry *synqe; 141 int i, ntids; 142 143 mtx_lock(&t->stid_lock); 144 t->stid_tab_stopped = true; 145 mtx_unlock(&t->stid_lock); 146 147 mtx_lock(&td->lctx_hash_lock); 148 for (i = 0; i <= td->listen_mask; i++) { 149 LIST_FOREACH(lctx, &td->listen_hash[i], link) 150 lctx->flags &= ~(LCTX_RPL_PENDING | LCTX_SETUP_IN_HW); 151 } 152 mtx_unlock(&td->lctx_hash_lock); 153 154 mtx_lock(&td->toep_list_lock); 155 TAILQ_FOREACH(synqe, &td->synqe_list, link) { 156 MPASS(sc->incarnation == synqe->incarnation); 157 MPASS(synqe->tid >= 0); 158 MPASS(synqe == lookup_tid(sc, synqe->tid)); 159 /* Remove tid from the lookup table immediately. */ 160 CTR(KTR_CXGBE, "%s: tid %d@%d STRANDED, removed from table", 161 __func__, synqe->tid, synqe->incarnation); 162 ntids = synqe->lctx->inp->inp_vflag & INP_IPV6 ? 2 : 1; 163 remove_tid(sc, synqe->tid, ntids); 164 #if 0 165 /* synqe->tid is stale now but left alone for debug. */ 166 synqe->tid = -1; 167 #endif 168 } 169 MPASS(TAILQ_EMPTY(&td->stranded_synqe)); 170 TAILQ_CONCAT(&td->stranded_synqe, &td->synqe_list, link); 171 MPASS(TAILQ_EMPTY(&td->synqe_list)); 172 mtx_unlock(&td->toep_list_lock); 173 } 174 175 void 176 restart_stid_tab(struct adapter *sc) 177 { 178 struct tid_info *t = &sc->tids; 179 struct tom_data *td = sc->tom_softc; 180 struct listen_ctx *lctx; 181 int i; 182 183 mtx_lock(&td->lctx_hash_lock); 184 for (i = 0; i <= td->listen_mask; i++) { 185 LIST_FOREACH(lctx, &td->listen_hash[i], link) { 186 MPASS((lctx->flags & (LCTX_RPL_PENDING | LCTX_SETUP_IN_HW)) == 0); 187 lctx->flags |= LCTX_RPL_PENDING; 188 if (lctx->inp->inp_vflag & INP_IPV6) 189 create_server6(sc, lctx); 190 else 191 create_server(sc, lctx); 192 } 193 } 194 mtx_unlock(&td->lctx_hash_lock); 195 196 mtx_lock(&t->stid_lock); 197 t->stid_tab_stopped = false; 198 mtx_unlock(&t->stid_lock); 199 200 } 201 202 static int 203 alloc_stid(struct adapter *sc, bool isipv6, void *ctx) 204 { 205 struct tid_info *t = &sc->tids; 206 const u_int n = isipv6 ? 2 : 1; 207 int stid, pair_stid; 208 u_int i; 209 ssize_t val; 210 211 mtx_lock(&t->stid_lock); 212 MPASS(t->stids_in_use <= t->nstids); 213 if (n > t->nstids - t->stids_in_use || t->stid_tab_stopped) { 214 mtx_unlock(&t->stid_lock); 215 return (-1); 216 } 217 218 stid = -1; 219 if (isipv6) { 220 /* 221 * An IPv6 server needs 2 naturally aligned stids (1 stid = 4 222 * cells) in the TCAM. We know that the start of the stid 223 * region is properly aligned already (the chip requires each 224 * region to be 128-cell aligned). 225 */ 226 for (i = 0; i + 1 < t->nstids; i = roundup2(val + 1, 2)) { 227 bit_ffc_area_at(t->stid_bitmap, i, t->nstids, 2, &val); 228 if (val == -1) 229 break; 230 if ((val & 1) == 0) { 231 stid = val; 232 break; 233 } 234 } 235 } else { 236 /* 237 * An IPv4 server needs one stid without any alignment 238 * requirements. But we try extra hard to find an available 239 * stid adjacent to a used stid so that free "stid-pairs" are 240 * left intact for IPv6. 241 */ 242 bit_ffc_at(t->stid_bitmap, 0, t->nstids, &val); 243 while (val != -1) { 244 if (stid == -1) { 245 /* 246 * First usable stid. Look no further if it's 247 * an ideal fit. 248 */ 249 stid = val; 250 if (val & 1 || bit_test(t->stid_bitmap, val + 1)) 251 break; 252 } else { 253 /* 254 * We have an unused stid already but are now 255 * looking for in-use stids because we'd prefer 256 * to grab an unused stid adjacent to one that's 257 * in use. 258 * 259 * Odd stids pair with the previous stid and 260 * even ones pair with the next stid. 261 */ 262 pair_stid = val & 1 ? val - 1 : val + 1; 263 if (bit_test(t->stid_bitmap, pair_stid) == 0) { 264 stid = pair_stid; 265 break; 266 } 267 } 268 val = roundup2(val + 1, 2); 269 if (val >= t->nstids) 270 break; 271 bit_ffs_at(t->stid_bitmap, val, t->nstids, &val); 272 } 273 } 274 275 if (stid >= 0) { 276 MPASS(stid + n - 1 < t->nstids); 277 MPASS(bit_ntest(t->stid_bitmap, stid, stid + n - 1, 0)); 278 bit_nset(t->stid_bitmap, stid, stid + n - 1); 279 t->stids_in_use += n; 280 t->stid_tab[stid] = ctx; 281 #ifdef INVARIANTS 282 if (n == 2) { 283 MPASS((stid & 1) == 0); 284 t->stid_tab[stid + 1] = NULL; 285 } 286 #endif 287 stid += t->stid_base; 288 } 289 mtx_unlock(&t->stid_lock); 290 return (stid); 291 } 292 293 static struct listen_ctx * 294 lookup_stid(struct adapter *sc, int stid) 295 { 296 struct tid_info *t = &sc->tids; 297 298 return (t->stid_tab[stid - t->stid_base]); 299 } 300 301 static void 302 free_stid(struct adapter *sc, int stid, bool isipv6) 303 { 304 struct tid_info *t = &sc->tids; 305 const u_int n = isipv6 ? 2 : 1; 306 307 mtx_lock(&t->stid_lock); 308 MPASS(stid >= t->stid_base); 309 stid -= t->stid_base; 310 MPASS(stid + n - 1 < t->nstids); 311 MPASS(t->stids_in_use <= t->nstids); 312 MPASS(t->stids_in_use >= n); 313 MPASS(t->stid_tab[stid] != NULL); 314 #ifdef INVARIANTS 315 if (n == 2) { 316 MPASS((stid & 1) == 0); 317 MPASS(t->stid_tab[stid + 1] == NULL); 318 } 319 #endif 320 MPASS(bit_ntest(t->stid_bitmap, stid, stid + n - 1, 1)); 321 bit_nclear(t->stid_bitmap, stid, stid + n - 1); 322 t->stid_tab[stid] = NULL; 323 t->stids_in_use -= n; 324 mtx_unlock(&t->stid_lock); 325 } 326 327 static struct listen_ctx * 328 alloc_lctx(struct adapter *sc, struct inpcb *inp, struct vi_info *vi) 329 { 330 struct listen_ctx *lctx; 331 332 INP_WLOCK_ASSERT(inp); 333 334 lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO); 335 if (lctx == NULL) 336 return (NULL); 337 338 lctx->isipv6 = inp->inp_vflag & INP_IPV6; 339 lctx->stid = alloc_stid(sc, lctx->isipv6, lctx); 340 if (lctx->stid < 0) { 341 free(lctx, M_CXGBE); 342 return (NULL); 343 } 344 345 if (lctx->isipv6 && 346 !IN6_ARE_ADDR_EQUAL(&in6addr_any, &inp->in6p_laddr)) { 347 lctx->ce = t4_get_clip_entry(sc, &inp->in6p_laddr, true); 348 if (lctx->ce == NULL) { 349 free(lctx, M_CXGBE); 350 return (NULL); 351 } 352 } 353 354 lctx->ctrlq = &sc->sge.ctrlq[vi->pi->port_id]; 355 lctx->ofld_rxq = &sc->sge.ofld_rxq[vi->first_ofld_rxq]; 356 refcount_init(&lctx->refcount, 1); 357 358 lctx->inp = inp; 359 lctx->vnet = inp->inp_socket->so_vnet; 360 in_pcbref(inp); 361 362 return (lctx); 363 } 364 365 /* Don't call this directly, use release_lctx instead */ 366 static int 367 free_lctx(struct adapter *sc, struct listen_ctx *lctx) 368 { 369 struct inpcb *inp = lctx->inp; 370 371 INP_WLOCK_ASSERT(inp); 372 KASSERT(lctx->refcount == 0, 373 ("%s: refcount %d", __func__, lctx->refcount)); 374 KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid)); 375 376 CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p", 377 __func__, lctx->stid, lctx, lctx->inp); 378 379 if (lctx->ce) 380 t4_release_clip_entry(sc, lctx->ce); 381 free_stid(sc, lctx->stid, lctx->isipv6); 382 free(lctx, M_CXGBE); 383 384 return (in_pcbrele_wlocked(inp)); 385 } 386 387 static void 388 hold_lctx(struct listen_ctx *lctx) 389 { 390 391 refcount_acquire(&lctx->refcount); 392 } 393 394 static inline uint32_t 395 listen_hashfn(void *key, u_long mask) 396 { 397 398 return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask); 399 } 400 401 /* 402 * Add a listen_ctx entry to the listen hash table. 403 */ 404 static void 405 listen_hash_add(struct adapter *sc, struct listen_ctx *lctx) 406 { 407 struct tom_data *td = sc->tom_softc; 408 int bucket = listen_hashfn(lctx->inp, td->listen_mask); 409 410 mtx_lock(&td->lctx_hash_lock); 411 LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link); 412 td->lctx_count++; 413 mtx_unlock(&td->lctx_hash_lock); 414 } 415 416 /* 417 * Look for the listening socket's context entry in the hash and return it. 418 */ 419 static struct listen_ctx * 420 listen_hash_find(struct adapter *sc, struct inpcb *inp) 421 { 422 struct tom_data *td = sc->tom_softc; 423 int bucket = listen_hashfn(inp, td->listen_mask); 424 struct listen_ctx *lctx; 425 426 mtx_lock(&td->lctx_hash_lock); 427 LIST_FOREACH(lctx, &td->listen_hash[bucket], link) { 428 if (lctx->inp == inp) 429 break; 430 } 431 mtx_unlock(&td->lctx_hash_lock); 432 433 return (lctx); 434 } 435 436 /* 437 * Removes the listen_ctx structure for inp from the hash and returns it. 438 */ 439 static struct listen_ctx * 440 listen_hash_del(struct adapter *sc, struct inpcb *inp) 441 { 442 struct tom_data *td = sc->tom_softc; 443 int bucket = listen_hashfn(inp, td->listen_mask); 444 struct listen_ctx *lctx, *l; 445 446 mtx_lock(&td->lctx_hash_lock); 447 LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) { 448 if (lctx->inp == inp) { 449 LIST_REMOVE(lctx, link); 450 td->lctx_count--; 451 break; 452 } 453 } 454 mtx_unlock(&td->lctx_hash_lock); 455 456 return (lctx); 457 } 458 459 /* 460 * Releases a hold on the lctx. Must be called with the listening socket's inp 461 * locked. The inp may be freed by this function and it returns NULL to 462 * indicate this. 463 */ 464 static struct inpcb * 465 release_lctx(struct adapter *sc, struct listen_ctx *lctx) 466 { 467 struct inpcb *inp = lctx->inp; 468 int inp_freed = 0; 469 470 INP_WLOCK_ASSERT(inp); 471 if (refcount_release(&lctx->refcount)) 472 inp_freed = free_lctx(sc, lctx); 473 474 return (inp_freed ? NULL : inp); 475 } 476 477 static void 478 send_flowc_wr_synqe(struct adapter *sc, struct synq_entry *synqe) 479 { 480 struct mbuf *m = synqe->syn; 481 if_t ifp = m->m_pkthdr.rcvif; 482 struct vi_info *vi = if_getsoftc(ifp); 483 struct port_info *pi = vi->pi; 484 struct wrqe *wr; 485 struct fw_flowc_wr *flowc; 486 struct sge_ofld_txq *ofld_txq; 487 struct sge_ofld_rxq *ofld_rxq; 488 const int nparams = 6; 489 const int flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); 490 const u_int pfvf = sc->pf << S_FW_VIID_PFN; 491 492 INP_WLOCK_ASSERT(synqe->lctx->inp); 493 MPASS((synqe->flags & TPF_FLOWC_WR_SENT) == 0); 494 495 ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx]; 496 ofld_rxq = &sc->sge.ofld_rxq[synqe->params.rxq_idx]; 497 498 wr = alloc_wrqe(roundup2(flowclen, 16), &ofld_txq->wrq); 499 if (wr == NULL) { 500 /* XXX */ 501 panic("%s: allocation failure.", __func__); 502 } 503 flowc = wrtod(wr); 504 memset(flowc, 0, wr->wr_len); 505 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | 506 V_FW_FLOWC_WR_NPARAMS(nparams)); 507 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) | 508 V_FW_WR_FLOWID(synqe->tid)); 509 flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN; 510 flowc->mnemval[0].val = htobe32(pfvf); 511 /* Firmware expects hw port and will translate to channel itself. */ 512 flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH; 513 flowc->mnemval[1].val = htobe32(pi->hw_port); 514 flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT; 515 flowc->mnemval[2].val = htobe32(pi->hw_port); 516 flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID; 517 flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id); 518 flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF; 519 flowc->mnemval[4].val = htobe32(512); 520 flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_MSS; 521 flowc->mnemval[5].val = htobe32(512); 522 523 synqe->flags |= TPF_FLOWC_WR_SENT; 524 t4_wrq_tx(sc, wr); 525 } 526 527 static void 528 send_abort_rpl_synqe(struct toedev *tod, struct synq_entry *synqe, 529 int rst_status) 530 { 531 struct adapter *sc = tod->tod_softc; 532 struct wrqe *wr; 533 struct cpl_abort_req *req; 534 535 INP_WLOCK_ASSERT(synqe->lctx->inp); 536 537 CTR5(KTR_CXGBE, "%s: synqe %p (0x%x), tid %d%s", 538 __func__, synqe, synqe->flags, synqe->tid, 539 synqe->flags & TPF_ABORT_SHUTDOWN ? 540 " (abort already in progress)" : ""); 541 if (synqe->flags & TPF_ABORT_SHUTDOWN) 542 return; /* abort already in progress */ 543 synqe->flags |= TPF_ABORT_SHUTDOWN; 544 545 if (!(synqe->flags & TPF_FLOWC_WR_SENT)) 546 send_flowc_wr_synqe(sc, synqe); 547 548 wr = alloc_wrqe(sizeof(*req), 549 &sc->sge.ofld_txq[synqe->params.txq_idx].wrq); 550 if (wr == NULL) { 551 /* XXX */ 552 panic("%s: allocation failure.", __func__); 553 } 554 req = wrtod(wr); 555 INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid); 556 req->rsvd0 = 0; /* don't have a snd_nxt */ 557 req->rsvd1 = 1; /* no data sent yet */ 558 req->cmd = rst_status; 559 560 t4_l2t_send(sc, wr, &sc->l2t->l2tab[synqe->params.l2t_idx]); 561 } 562 563 static int 564 create_server(struct adapter *sc, struct listen_ctx *lctx) 565 { 566 struct wrqe *wr; 567 struct cpl_pass_open_req *req; 568 struct inpcb *inp = lctx->inp; 569 570 wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); 571 if (wr == NULL) { 572 log(LOG_ERR, "%s: allocation failure", __func__); 573 return (ENOMEM); 574 } 575 req = wrtod(wr); 576 577 INIT_TP_WR(req, 0); 578 OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid)); 579 req->local_port = inp->inp_lport; 580 req->peer_port = 0; 581 req->local_ip = inp->inp_laddr.s_addr; 582 req->peer_ip = 0; 583 req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan)); 584 req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) | 585 F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id)); 586 587 t4_wrq_tx(sc, wr); 588 return (0); 589 } 590 591 static int 592 create_server6(struct adapter *sc, struct listen_ctx *lctx) 593 { 594 struct wrqe *wr; 595 struct cpl_pass_open_req6 *req; 596 struct inpcb *inp = lctx->inp; 597 598 wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); 599 if (wr == NULL) { 600 log(LOG_ERR, "%s: allocation failure", __func__); 601 return (ENOMEM); 602 } 603 req = wrtod(wr); 604 605 INIT_TP_WR(req, 0); 606 OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, lctx->stid)); 607 req->local_port = inp->inp_lport; 608 req->peer_port = 0; 609 req->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0]; 610 req->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8]; 611 req->peer_ip_hi = 0; 612 req->peer_ip_lo = 0; 613 req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan)); 614 req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) | 615 F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id)); 616 617 t4_wrq_tx(sc, wr); 618 return (0); 619 } 620 621 static int 622 destroy_server(struct adapter *sc, struct listen_ctx *lctx) 623 { 624 struct wrqe *wr; 625 struct cpl_close_listsvr_req *req; 626 627 wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); 628 if (wr == NULL) { 629 /* XXX */ 630 panic("%s: allocation failure.", __func__); 631 } 632 req = wrtod(wr); 633 634 INIT_TP_WR(req, 0); 635 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, 636 lctx->stid)); 637 req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id); 638 req->rsvd = htobe16(0); 639 640 t4_wrq_tx(sc, wr); 641 return (0); 642 } 643 644 /* 645 * Start a listening server by sending a passive open request to HW. 646 * 647 * Can't take adapter lock here and access to sc->flags, 648 * sc->offload_map, if_capenable are all race prone. 649 */ 650 int 651 t4_listen_start(struct toedev *tod, struct tcpcb *tp) 652 { 653 struct adapter *sc = tod->tod_softc; 654 struct vi_info *vi; 655 struct port_info *pi; 656 struct inpcb *inp = tptoinpcb(tp); 657 struct listen_ctx *lctx; 658 int i, rc, v; 659 struct offload_settings settings; 660 661 INP_WLOCK_ASSERT(inp); 662 663 rw_rlock(&sc->policy_lock); 664 settings = *lookup_offload_policy(sc, OPEN_TYPE_LISTEN, NULL, 665 EVL_MAKETAG(0xfff, 0, 0), inp); 666 rw_runlock(&sc->policy_lock); 667 if (!settings.offload) 668 return (0); 669 670 /* Don't start a hardware listener for any loopback address. */ 671 if (inp->inp_vflag & INP_IPV6 && IN6_IS_ADDR_LOOPBACK(&inp->in6p_laddr)) 672 return (0); 673 if (!(inp->inp_vflag & INP_IPV6) && 674 IN_LOOPBACK(ntohl(inp->inp_laddr.s_addr))) 675 return (0); 676 if (sc->flags & KERN_TLS_ON) 677 return (0); 678 #if 0 679 ADAPTER_LOCK(sc); 680 if (IS_BUSY(sc)) { 681 log(LOG_ERR, "%s: listen request ignored, %s is busy", 682 __func__, device_get_nameunit(sc->dev)); 683 goto done; 684 } 685 686 KASSERT(uld_active(sc, ULD_TOM), 687 ("%s: TOM not initialized", __func__)); 688 #endif 689 690 /* 691 * Find an initialized VI with IFCAP_TOE (4 or 6). We'll use the first 692 * such VI's queues to send the passive open and receive the reply to 693 * it. 694 * 695 * XXX: need a way to mark a port in use by offload. if_cxgbe should 696 * then reject any attempt to bring down such a port (and maybe reject 697 * attempts to disable IFCAP_TOE on that port too?). 698 */ 699 for_each_port(sc, i) { 700 pi = sc->port[i]; 701 for_each_vi(pi, v, vi) { 702 if (vi->flags & VI_INIT_DONE && 703 if_getcapenable(vi->ifp) & IFCAP_TOE) 704 goto found; 705 } 706 } 707 goto done; /* no port that's UP with IFCAP_TOE enabled */ 708 found: 709 710 if (listen_hash_find(sc, inp) != NULL) 711 goto done; /* already setup */ 712 713 lctx = alloc_lctx(sc, inp, vi); 714 if (lctx == NULL) { 715 log(LOG_ERR, 716 "%s: listen request ignored, %s couldn't allocate lctx\n", 717 __func__, device_get_nameunit(sc->dev)); 718 goto done; 719 } 720 listen_hash_add(sc, lctx); 721 722 CTR6(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p vflag 0x%x", 723 __func__, lctx->stid, tcpstates[tp->t_state], lctx, inp, 724 inp->inp_vflag); 725 726 if (inp->inp_vflag & INP_IPV6) 727 rc = create_server6(sc, lctx); 728 else 729 rc = create_server(sc, lctx); 730 if (rc != 0) { 731 log(LOG_ERR, "%s: %s failed to create hw listener: %d.\n", 732 __func__, device_get_nameunit(sc->dev), rc); 733 (void) listen_hash_del(sc, inp); 734 inp = release_lctx(sc, lctx); 735 /* can't be freed, host stack has a reference */ 736 KASSERT(inp != NULL, ("%s: inp freed", __func__)); 737 goto done; 738 } 739 lctx->flags |= LCTX_RPL_PENDING; 740 done: 741 #if 0 742 ADAPTER_UNLOCK(sc); 743 #endif 744 return (0); 745 } 746 747 int 748 t4_listen_stop(struct toedev *tod, struct tcpcb *tp) 749 { 750 struct listen_ctx *lctx; 751 struct adapter *sc = tod->tod_softc; 752 struct inpcb *inp = tptoinpcb(tp); 753 754 INP_WLOCK_ASSERT(inp); 755 756 lctx = listen_hash_del(sc, inp); 757 if (lctx == NULL) 758 return (ENOENT); /* no hardware listener for this inp */ 759 760 CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid, 761 lctx, lctx->flags); 762 763 /* 764 * If the reply to the PASS_OPEN is still pending we'll wait for it to 765 * arrive and clean up when it does. 766 */ 767 if (lctx->flags & LCTX_RPL_PENDING) { 768 return (EINPROGRESS); 769 } 770 771 if (lctx->flags & LCTX_SETUP_IN_HW) 772 destroy_server(sc, lctx); 773 else 774 inp = release_lctx(sc, lctx); 775 return (0); 776 } 777 778 static inline struct synq_entry * 779 alloc_synqe(struct adapter *sc, struct listen_ctx *lctx, int flags) 780 { 781 struct synq_entry *synqe; 782 783 INP_RLOCK_ASSERT(lctx->inp); 784 MPASS(flags == M_WAITOK || flags == M_NOWAIT); 785 786 synqe = malloc(sizeof(*synqe), M_CXGBE, flags); 787 if (__predict_true(synqe != NULL)) { 788 synqe->flags = TPF_SYNQE; 789 synqe->incarnation = sc->incarnation; 790 refcount_init(&synqe->refcnt, 1); 791 synqe->lctx = lctx; 792 hold_lctx(lctx); /* Every synqe has a ref on its lctx. */ 793 synqe->syn = NULL; 794 } 795 796 return (synqe); 797 } 798 799 static inline void 800 hold_synqe(struct synq_entry *synqe) 801 { 802 803 refcount_acquire(&synqe->refcnt); 804 } 805 806 static inline struct inpcb * 807 release_synqe(struct adapter *sc, struct synq_entry *synqe) 808 { 809 struct inpcb *inp; 810 811 MPASS(synqe->flags & TPF_SYNQE); 812 MPASS(synqe->lctx != NULL); 813 814 inp = synqe->lctx->inp; 815 MPASS(inp != NULL); 816 INP_WLOCK_ASSERT(inp); 817 818 if (refcount_release(&synqe->refcnt)) { 819 inp = release_lctx(sc, synqe->lctx); 820 m_freem(synqe->syn); 821 free(synqe, M_CXGBE); 822 } 823 824 return (inp); 825 } 826 827 void 828 t4_syncache_added(struct toedev *tod __unused, void *arg) 829 { 830 struct synq_entry *synqe = arg; 831 832 hold_synqe(synqe); 833 } 834 835 void 836 t4_syncache_removed(struct toedev *tod, void *arg) 837 { 838 struct adapter *sc = tod->tod_softc; 839 struct synq_entry *synqe = arg; 840 struct inpcb *inp = synqe->lctx->inp; 841 842 /* 843 * XXX: this is a LOR but harmless when running from the softclock. 844 */ 845 INP_WLOCK(inp); 846 inp = release_synqe(sc, synqe); 847 if (inp != NULL) 848 INP_WUNLOCK(inp); 849 } 850 851 int 852 t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m) 853 { 854 struct synq_entry *synqe = arg; 855 856 if (atomic_fetchadd_int(&synqe->ok_to_respond, 1) == 0) { 857 struct tcpopt to; 858 struct ip *ip = mtod(m, struct ip *); 859 struct tcphdr *th; 860 861 if (ip->ip_v == IPVERSION) 862 th = (void *)(ip + 1); 863 else 864 th = (void *)((struct ip6_hdr *)ip + 1); 865 bzero(&to, sizeof(to)); 866 tcp_dooptions(&to, (void *)(th + 1), 867 (th->th_off << 2) - sizeof(*th), TO_SYN); 868 869 /* save these for later */ 870 synqe->iss = be32toh(th->th_seq); 871 synqe->irs = be32toh(th->th_ack) - 1; 872 synqe->ts = to.to_tsval; 873 } 874 875 m_freem(m); /* don't need this any more */ 876 return (0); 877 } 878 879 static int 880 do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss, 881 struct mbuf *m) 882 { 883 struct adapter *sc = iq->adapter; 884 const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1); 885 int stid = GET_TID(cpl); 886 unsigned int status = cpl->status; 887 struct listen_ctx *lctx = lookup_stid(sc, stid); 888 struct inpcb *inp = lctx->inp; 889 #ifdef INVARIANTS 890 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 891 #endif 892 893 KASSERT(opcode == CPL_PASS_OPEN_RPL, 894 ("%s: unexpected opcode 0x%x", __func__, opcode)); 895 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 896 KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); 897 898 INP_WLOCK(inp); 899 900 CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x", 901 __func__, stid, status, lctx->flags); 902 903 lctx->flags &= ~LCTX_RPL_PENDING; 904 if (status == CPL_ERR_NONE) 905 lctx->flags |= LCTX_SETUP_IN_HW; 906 else 907 log(LOG_ERR, "listener (stid %u) failed: %d\n", stid, status); 908 909 #ifdef INVARIANTS 910 /* 911 * If the inp has been dropped (listening socket closed) then 912 * listen_stop must have run and taken the inp out of the hash. 913 */ 914 if (inp->inp_flags & INP_DROPPED) { 915 KASSERT(listen_hash_del(sc, inp) == NULL, 916 ("%s: inp %p still in listen hash", __func__, inp)); 917 } 918 #endif 919 920 if (inp->inp_flags & INP_DROPPED && status != CPL_ERR_NONE) { 921 if (release_lctx(sc, lctx) != NULL) 922 INP_WUNLOCK(inp); 923 return (status); 924 } 925 926 /* 927 * Listening socket stopped listening earlier and now the chip tells us 928 * it has started the hardware listener. Stop it; the lctx will be 929 * released in do_close_server_rpl. 930 */ 931 if (inp->inp_flags & INP_DROPPED) { 932 destroy_server(sc, lctx); 933 INP_WUNLOCK(inp); 934 return (status); 935 } 936 937 /* 938 * Failed to start hardware listener. Take inp out of the hash and 939 * release our reference on it. An error message has been logged 940 * already. 941 */ 942 if (status != CPL_ERR_NONE) { 943 listen_hash_del(sc, inp); 944 if (release_lctx(sc, lctx) != NULL) 945 INP_WUNLOCK(inp); 946 return (status); 947 } 948 949 /* hardware listener open for business */ 950 951 INP_WUNLOCK(inp); 952 return (status); 953 } 954 955 static int 956 do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss, 957 struct mbuf *m) 958 { 959 struct adapter *sc = iq->adapter; 960 const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1); 961 int stid = GET_TID(cpl); 962 unsigned int status = cpl->status; 963 struct listen_ctx *lctx = lookup_stid(sc, stid); 964 struct inpcb *inp = lctx->inp; 965 #ifdef INVARIANTS 966 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 967 #endif 968 969 KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL, 970 ("%s: unexpected opcode 0x%x", __func__, opcode)); 971 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 972 KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); 973 974 CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status); 975 976 if (status != CPL_ERR_NONE) { 977 log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n", 978 __func__, status, stid); 979 return (status); 980 } 981 982 INP_WLOCK(inp); 983 inp = release_lctx(sc, lctx); 984 if (inp != NULL) 985 INP_WUNLOCK(inp); 986 987 return (status); 988 } 989 990 static void 991 done_with_synqe(struct adapter *sc, struct synq_entry *synqe) 992 { 993 struct tom_data *td = sc->tom_softc; 994 struct listen_ctx *lctx = synqe->lctx; 995 struct inpcb *inp = lctx->inp; 996 struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx]; 997 int ntids; 998 999 INP_WLOCK_ASSERT(inp); 1000 1001 if (synqe->tid != -1) { 1002 ntids = inp->inp_vflag & INP_IPV6 ? 2 : 1; 1003 remove_tid(sc, synqe->tid, ntids); 1004 mtx_lock(&td->toep_list_lock); 1005 TAILQ_REMOVE(&td->synqe_list, synqe, link); 1006 mtx_unlock(&td->toep_list_lock); 1007 release_tid(sc, synqe->tid, lctx->ctrlq); 1008 } 1009 t4_l2t_release(e); 1010 inp = release_synqe(sc, synqe); 1011 if (inp) 1012 INP_WUNLOCK(inp); 1013 } 1014 1015 void 1016 synack_failure_cleanup(struct adapter *sc, struct synq_entry *synqe) 1017 { 1018 INP_WLOCK(synqe->lctx->inp); 1019 done_with_synqe(sc, synqe); 1020 } 1021 1022 int 1023 do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss, 1024 struct mbuf *m) 1025 { 1026 struct adapter *sc = iq->adapter; 1027 const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1); 1028 unsigned int tid = GET_TID(cpl); 1029 struct synq_entry *synqe = lookup_tid(sc, tid); 1030 struct listen_ctx *lctx = synqe->lctx; 1031 struct inpcb *inp = lctx->inp; 1032 struct sge_ofld_txq *ofld_txq; 1033 #ifdef INVARIANTS 1034 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1035 #endif 1036 1037 KASSERT(opcode == CPL_ABORT_REQ_RSS, 1038 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1039 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1040 KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__)); 1041 1042 CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d", 1043 __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status); 1044 1045 if (negative_advice(cpl->status)) 1046 return (0); /* Ignore negative advice */ 1047 1048 INP_WLOCK(inp); 1049 1050 ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx]; 1051 1052 if (!(synqe->flags & TPF_FLOWC_WR_SENT)) 1053 send_flowc_wr_synqe(sc, synqe); 1054 1055 /* 1056 * If we'd initiated an abort earlier the reply to it is responsible for 1057 * cleaning up resources. Otherwise we tear everything down right here 1058 * right now. We owe the T4 a CPL_ABORT_RPL no matter what. 1059 */ 1060 if (synqe->flags & TPF_ABORT_SHUTDOWN) { 1061 INP_WUNLOCK(inp); 1062 goto done; 1063 } 1064 1065 done_with_synqe(sc, synqe); 1066 /* inp lock released by done_with_synqe */ 1067 done: 1068 send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); 1069 return (0); 1070 } 1071 1072 int 1073 do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss, 1074 struct mbuf *m) 1075 { 1076 struct adapter *sc = iq->adapter; 1077 const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1); 1078 unsigned int tid = GET_TID(cpl); 1079 struct synq_entry *synqe = lookup_tid(sc, tid); 1080 struct listen_ctx *lctx = synqe->lctx; 1081 struct inpcb *inp = lctx->inp; 1082 #ifdef INVARIANTS 1083 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1084 #endif 1085 1086 KASSERT(opcode == CPL_ABORT_RPL_RSS, 1087 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1088 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1089 KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__)); 1090 1091 CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d", 1092 __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status); 1093 1094 INP_WLOCK(inp); 1095 KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN, 1096 ("%s: wasn't expecting abort reply for synqe %p (0x%x)", 1097 __func__, synqe, synqe->flags)); 1098 1099 done_with_synqe(sc, synqe); 1100 /* inp lock released by done_with_synqe */ 1101 1102 return (0); 1103 } 1104 1105 void 1106 t4_offload_socket(struct toedev *tod, void *arg, struct socket *so) 1107 { 1108 struct adapter *sc = tod->tod_softc; 1109 struct tom_data *td = sc->tom_softc; 1110 struct synq_entry *synqe = arg; 1111 struct inpcb *inp = sotoinpcb(so); 1112 struct toepcb *toep = synqe->toep; 1113 1114 NET_EPOCH_ASSERT(); /* prevents bad race with accept() */ 1115 INP_WLOCK_ASSERT(inp); 1116 KASSERT(synqe->flags & TPF_SYNQE, 1117 ("%s: %p not a synq_entry?", __func__, arg)); 1118 MPASS(toep->tid == synqe->tid); 1119 1120 offload_socket(so, toep); 1121 make_established(toep, synqe->iss, synqe->irs, synqe->tcp_opt); 1122 toep->flags |= TPF_CPL_PENDING; 1123 update_tid(sc, synqe->tid, toep); 1124 synqe->flags |= TPF_SYNQE_EXPANDED; 1125 mtx_lock(&td->toep_list_lock); 1126 /* Remove synqe from its list and add the TOE PCB to the active list. */ 1127 TAILQ_REMOVE(&td->synqe_list, synqe, link); 1128 TAILQ_INSERT_TAIL(&td->toep_list, toep, link); 1129 toep->flags |= TPF_IN_TOEP_LIST; 1130 mtx_unlock(&td->toep_list_lock); 1131 inp->inp_flowtype = (inp->inp_vflag & INP_IPV6) ? 1132 M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4; 1133 inp->inp_flowid = synqe->rss_hash; 1134 } 1135 1136 static void 1137 t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to) 1138 { 1139 bzero(to, sizeof(*to)); 1140 1141 if (t4opt->mss) { 1142 to->to_flags |= TOF_MSS; 1143 to->to_mss = be16toh(t4opt->mss); 1144 } 1145 1146 if (t4opt->wsf > 0 && t4opt->wsf < 15) { 1147 to->to_flags |= TOF_SCALE; 1148 to->to_wscale = t4opt->wsf; 1149 } 1150 1151 if (t4opt->tstamp) 1152 to->to_flags |= TOF_TS; 1153 1154 if (t4opt->sack) 1155 to->to_flags |= TOF_SACKPERM; 1156 } 1157 1158 static bool 1159 encapsulated_syn(struct adapter *sc, const struct cpl_pass_accept_req *cpl) 1160 { 1161 u_int hlen = be32toh(cpl->hdr_len); 1162 1163 if (chip_id(sc) >= CHELSIO_T6) 1164 return (G_T6_ETH_HDR_LEN(hlen) > sizeof(struct ether_vlan_header)); 1165 else 1166 return (G_ETH_HDR_LEN(hlen) > sizeof(struct ether_vlan_header)); 1167 } 1168 1169 static void 1170 pass_accept_req_to_protohdrs(struct adapter *sc, const struct mbuf *m, 1171 struct in_conninfo *inc, struct tcphdr *th, uint8_t *iptos) 1172 { 1173 const struct cpl_pass_accept_req *cpl = mtod(m, const void *); 1174 const struct ether_header *eh; 1175 unsigned int hlen = be32toh(cpl->hdr_len); 1176 uintptr_t l3hdr; 1177 const struct tcphdr *tcp; 1178 1179 eh = (const void *)(cpl + 1); 1180 if (chip_id(sc) >= CHELSIO_T6) { 1181 l3hdr = ((uintptr_t)eh + G_T6_ETH_HDR_LEN(hlen)); 1182 tcp = (const void *)(l3hdr + G_T6_IP_HDR_LEN(hlen)); 1183 } else { 1184 l3hdr = ((uintptr_t)eh + G_ETH_HDR_LEN(hlen)); 1185 tcp = (const void *)(l3hdr + G_IP_HDR_LEN(hlen)); 1186 } 1187 1188 /* extract TOS (DiffServ + ECN) byte for AccECN */ 1189 if (iptos) { 1190 if (((struct ip *)l3hdr)->ip_v == IPVERSION) { 1191 const struct ip *ip = (const void *)l3hdr; 1192 *iptos = ip->ip_tos; 1193 } 1194 #ifdef INET6 1195 else 1196 if (((struct ip *)l3hdr)->ip_v == (IPV6_VERSION >> 4)) { 1197 const struct ip6_hdr *ip6 = (const void *)l3hdr; 1198 *iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; 1199 } 1200 #endif /* INET */ 1201 } 1202 1203 if (inc) { 1204 bzero(inc, sizeof(*inc)); 1205 inc->inc_fport = tcp->th_sport; 1206 inc->inc_lport = tcp->th_dport; 1207 if (((struct ip *)l3hdr)->ip_v == IPVERSION) { 1208 const struct ip *ip = (const void *)l3hdr; 1209 1210 inc->inc_faddr = ip->ip_src; 1211 inc->inc_laddr = ip->ip_dst; 1212 } else { 1213 const struct ip6_hdr *ip6 = (const void *)l3hdr; 1214 1215 inc->inc_flags |= INC_ISIPV6; 1216 inc->inc6_faddr = ip6->ip6_src; 1217 inc->inc6_laddr = ip6->ip6_dst; 1218 } 1219 } 1220 1221 if (th) { 1222 bcopy(tcp, th, sizeof(*th)); 1223 tcp_fields_to_host(th); /* just like tcp_input */ 1224 } 1225 } 1226 1227 static struct l2t_entry * 1228 get_l2te_for_nexthop(struct port_info *pi, if_t ifp, 1229 struct in_conninfo *inc) 1230 { 1231 struct l2t_entry *e; 1232 struct sockaddr_in6 sin6; 1233 struct sockaddr *dst = (void *)&sin6; 1234 struct nhop_object *nh; 1235 1236 if (inc->inc_flags & INC_ISIPV6) { 1237 bzero(dst, sizeof(struct sockaddr_in6)); 1238 dst->sa_len = sizeof(struct sockaddr_in6); 1239 dst->sa_family = AF_INET6; 1240 1241 if (IN6_IS_ADDR_LINKLOCAL(&inc->inc6_laddr)) { 1242 /* no need for route lookup */ 1243 e = t4_l2t_get(pi, ifp, dst); 1244 return (e); 1245 } 1246 1247 nh = fib6_lookup(RT_DEFAULT_FIB, &inc->inc6_faddr, 0, NHR_NONE, 0); 1248 if (nh == NULL) 1249 return (NULL); 1250 if (nh->nh_ifp != ifp) 1251 return (NULL); 1252 if (nh->nh_flags & NHF_GATEWAY) 1253 ((struct sockaddr_in6 *)dst)->sin6_addr = nh->gw6_sa.sin6_addr; 1254 else 1255 ((struct sockaddr_in6 *)dst)->sin6_addr = inc->inc6_faddr; 1256 } else { 1257 dst->sa_len = sizeof(struct sockaddr_in); 1258 dst->sa_family = AF_INET; 1259 1260 nh = fib4_lookup(RT_DEFAULT_FIB, inc->inc_faddr, 0, NHR_NONE, 0); 1261 if (nh == NULL) 1262 return (NULL); 1263 if (nh->nh_ifp != ifp) 1264 return (NULL); 1265 if (nh->nh_flags & NHF_GATEWAY) 1266 if (nh->gw_sa.sa_family == AF_INET) 1267 ((struct sockaddr_in *)dst)->sin_addr = nh->gw4_sa.sin_addr; 1268 else 1269 *((struct sockaddr_in6 *)dst) = nh->gw6_sa; 1270 else 1271 ((struct sockaddr_in *)dst)->sin_addr = inc->inc_faddr; 1272 } 1273 1274 e = t4_l2t_get(pi, ifp, dst); 1275 return (e); 1276 } 1277 1278 static int 1279 send_synack(struct adapter *sc, struct synq_entry *synqe, uint64_t opt0, 1280 uint32_t opt2, int tid) 1281 { 1282 struct wrqe *wr; 1283 struct cpl_pass_accept_rpl *rpl; 1284 struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx]; 1285 1286 wr = alloc_wrqe(is_t4(sc) ? sizeof(struct cpl_pass_accept_rpl) : 1287 sizeof(struct cpl_t5_pass_accept_rpl), &sc->sge.ctrlq[0]); 1288 if (wr == NULL) 1289 return (ENOMEM); 1290 rpl = wrtod(wr); 1291 1292 if (is_t4(sc)) 1293 INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid); 1294 else { 1295 struct cpl_t5_pass_accept_rpl *rpl5 = (void *)rpl; 1296 1297 INIT_TP_WR_MIT_CPL(rpl5, CPL_PASS_ACCEPT_RPL, tid); 1298 rpl5->iss = htobe32(synqe->iss); 1299 } 1300 rpl->opt0 = opt0; 1301 rpl->opt2 = opt2; 1302 1303 return (t4_l2t_send(sc, wr, e)); 1304 } 1305 1306 #define REJECT_PASS_ACCEPT_REQ(tunnel) do { \ 1307 if (!tunnel) { \ 1308 m_freem(m); \ 1309 m = NULL; \ 1310 } \ 1311 reject_reason = __LINE__; \ 1312 goto reject; \ 1313 } while (0) 1314 1315 /* 1316 * The context associated with a tid entry via insert_tid could be a synq_entry 1317 * or a toepcb. The only way CPL handlers can tell is via a bit in these flags. 1318 */ 1319 CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags)); 1320 1321 /* 1322 * Incoming SYN on a listening socket. 1323 * 1324 * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe, 1325 * etc. 1326 */ 1327 static int 1328 do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss, 1329 struct mbuf *m) 1330 { 1331 struct adapter *sc = iq->adapter; 1332 struct tom_data *td = sc->tom_softc; 1333 struct toedev *tod; 1334 const struct cpl_pass_accept_req *cpl = mtod(m, const void *); 1335 unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid)); 1336 unsigned int tid = GET_TID(cpl); 1337 struct listen_ctx *lctx = lookup_stid(sc, stid); 1338 struct inpcb *inp; 1339 struct socket *so; 1340 struct in_conninfo inc; 1341 struct tcphdr th; 1342 struct tcpopt to; 1343 struct port_info *pi; 1344 struct vi_info *vi; 1345 if_t hw_ifp, ifp; 1346 struct l2t_entry *e = NULL; 1347 struct synq_entry *synqe = NULL; 1348 int reject_reason, v, ntids; 1349 uint16_t vid, l2info; 1350 struct epoch_tracker et; 1351 #ifdef INVARIANTS 1352 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1353 #endif 1354 struct offload_settings settings; 1355 uint8_t iptos; 1356 1357 KASSERT(opcode == CPL_PASS_ACCEPT_REQ, 1358 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1359 KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); 1360 1361 CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid, 1362 lctx); 1363 1364 /* 1365 * Figure out the port the SYN arrived on. We'll look for an exact VI 1366 * match in a bit but in case we don't find any we'll use the main VI as 1367 * the incoming ifnet. 1368 */ 1369 l2info = be16toh(cpl->l2info); 1370 pi = sc->port[G_SYN_INTF(l2info)]; 1371 hw_ifp = pi->vi[0].ifp; 1372 m->m_pkthdr.rcvif = hw_ifp; 1373 1374 CURVNET_SET(lctx->vnet); /* before any potential REJECT */ 1375 1376 /* 1377 * If VXLAN/NVGRE parsing is enabled then SYNs in the inner traffic will 1378 * also hit the listener. We don't want to offload those. 1379 */ 1380 if (encapsulated_syn(sc, cpl)) { 1381 REJECT_PASS_ACCEPT_REQ(true); 1382 } 1383 1384 /* 1385 * Use the MAC index to lookup the associated VI. If this SYN didn't 1386 * match a perfect MAC filter, punt. 1387 */ 1388 if (!(l2info & F_SYN_XACT_MATCH)) { 1389 REJECT_PASS_ACCEPT_REQ(true); 1390 } 1391 for_each_vi(pi, v, vi) { 1392 if (vi->xact_addr_filt == G_SYN_MAC_IDX(l2info)) 1393 goto found; 1394 } 1395 REJECT_PASS_ACCEPT_REQ(true); 1396 found: 1397 hw_ifp = vi->ifp; /* the cxgbe ifnet */ 1398 m->m_pkthdr.rcvif = hw_ifp; 1399 tod = TOEDEV(hw_ifp); 1400 1401 /* 1402 * Don't offload if the peer requested a TCP option that's not known to 1403 * the silicon. Send the SYN to the kernel instead. 1404 */ 1405 if (__predict_false(cpl->tcpopt.unknown)) 1406 REJECT_PASS_ACCEPT_REQ(true); 1407 1408 /* 1409 * Figure out if there is a pseudo interface (vlan, lagg, etc.) 1410 * involved. Don't offload if the SYN had a VLAN tag and the vid 1411 * doesn't match anything on this interface. 1412 * 1413 * XXX: lagg support, lagg + vlan support. 1414 */ 1415 vid = EVL_VLANOFTAG(be16toh(cpl->vlan)); 1416 if (vid != 0xfff && vid != 0) { 1417 ifp = VLAN_DEVAT(hw_ifp, vid); 1418 if (ifp == NULL) 1419 REJECT_PASS_ACCEPT_REQ(true); 1420 } else 1421 ifp = hw_ifp; 1422 1423 /* 1424 * Don't offload if the ifnet that the SYN came in on is not in the same 1425 * vnet as the listening socket. 1426 */ 1427 if (lctx->vnet != if_getvnet(ifp)) 1428 REJECT_PASS_ACCEPT_REQ(true); 1429 1430 pass_accept_req_to_protohdrs(sc, m, &inc, &th, &iptos); 1431 if (inc.inc_flags & INC_ISIPV6) { 1432 1433 /* Don't offload if the ifcap isn't enabled */ 1434 if ((if_getcapenable(ifp) & IFCAP_TOE6) == 0) 1435 REJECT_PASS_ACCEPT_REQ(true); 1436 1437 /* 1438 * SYN must be directed to an IP6 address on this ifnet. This 1439 * is more restrictive than in6_localip. 1440 */ 1441 NET_EPOCH_ENTER(et); 1442 if (!in6_ifhasaddr(ifp, &inc.inc6_laddr)) { 1443 NET_EPOCH_EXIT(et); 1444 REJECT_PASS_ACCEPT_REQ(true); 1445 } 1446 1447 ntids = 2; 1448 } else { 1449 1450 /* Don't offload if the ifcap isn't enabled */ 1451 if ((if_getcapenable(ifp) & IFCAP_TOE4) == 0) 1452 REJECT_PASS_ACCEPT_REQ(true); 1453 1454 /* 1455 * SYN must be directed to an IP address on this ifnet. This 1456 * is more restrictive than in_localip. 1457 */ 1458 NET_EPOCH_ENTER(et); 1459 if (!in_ifhasaddr(ifp, inc.inc_laddr)) { 1460 NET_EPOCH_EXIT(et); 1461 REJECT_PASS_ACCEPT_REQ(true); 1462 } 1463 1464 ntids = 1; 1465 } 1466 1467 e = get_l2te_for_nexthop(pi, ifp, &inc); 1468 if (e == NULL) { 1469 NET_EPOCH_EXIT(et); 1470 REJECT_PASS_ACCEPT_REQ(true); 1471 } 1472 1473 /* Don't offload if the 4-tuple is already in use */ 1474 if (toe_4tuple_check(&inc, &th, ifp) != 0) { 1475 NET_EPOCH_EXIT(et); 1476 REJECT_PASS_ACCEPT_REQ(false); 1477 } 1478 1479 inp = lctx->inp; /* listening socket, not owned by TOE */ 1480 INP_RLOCK(inp); 1481 1482 /* Don't offload if the listening socket has closed */ 1483 if (__predict_false(inp->inp_flags & INP_DROPPED)) { 1484 INP_RUNLOCK(inp); 1485 NET_EPOCH_EXIT(et); 1486 REJECT_PASS_ACCEPT_REQ(false); 1487 } 1488 so = inp->inp_socket; 1489 rw_rlock(&sc->policy_lock); 1490 settings = *lookup_offload_policy(sc, OPEN_TYPE_PASSIVE, m, 1491 EVL_MAKETAG(0xfff, 0, 0), inp); 1492 rw_runlock(&sc->policy_lock); 1493 if (!settings.offload) { 1494 INP_RUNLOCK(inp); 1495 NET_EPOCH_EXIT(et); 1496 REJECT_PASS_ACCEPT_REQ(true); /* Rejected by COP. */ 1497 } 1498 1499 synqe = alloc_synqe(sc, lctx, M_NOWAIT); 1500 if (synqe == NULL) { 1501 INP_RUNLOCK(inp); 1502 NET_EPOCH_EXIT(et); 1503 REJECT_PASS_ACCEPT_REQ(true); 1504 } 1505 MPASS(rss->hash_type == RSS_HASH_TCP); 1506 synqe->rss_hash = be32toh(rss->hash_val); 1507 atomic_store_int(&synqe->ok_to_respond, 0); 1508 1509 init_conn_params(vi, &settings, &inc, so, &cpl->tcpopt, e->idx, 1510 &synqe->params); 1511 if (sc->params.tid_qid_sel_mask != 0) 1512 update_tid_qid_sel(vi, &synqe->params, tid); 1513 1514 /* 1515 * If all goes well t4_syncache_respond will get called during 1516 * syncache_add. Note that syncache_add releases the pcb lock. 1517 */ 1518 t4opt_to_tcpopt(&cpl->tcpopt, &to); 1519 toe_syncache_add(&inc, &to, &th, inp, tod, synqe, iptos); 1520 1521 if (atomic_load_int(&synqe->ok_to_respond) > 0) { 1522 uint64_t opt0; 1523 uint32_t opt2; 1524 1525 opt0 = calc_options0(vi, &synqe->params); 1526 opt2 = calc_options2(vi, &synqe->params); 1527 1528 insert_tid(sc, tid, synqe, ntids); 1529 synqe->tid = tid; 1530 synqe->syn = m; 1531 m = NULL; 1532 mtx_lock(&td->toep_list_lock); 1533 TAILQ_INSERT_TAIL(&td->synqe_list, synqe, link); 1534 mtx_unlock(&td->toep_list_lock); 1535 1536 if (send_synack(sc, synqe, opt0, opt2, tid) != 0) { 1537 remove_tid(sc, tid, ntids); 1538 m = synqe->syn; 1539 synqe->syn = NULL; 1540 mtx_lock(&td->toep_list_lock); 1541 TAILQ_REMOVE(&td->synqe_list, synqe, link); 1542 mtx_unlock(&td->toep_list_lock); 1543 NET_EPOCH_EXIT(et); 1544 REJECT_PASS_ACCEPT_REQ(true); 1545 } 1546 CTR6(KTR_CXGBE, 1547 "%s: stid %u, tid %u, synqe %p, opt0 %#016lx, opt2 %#08x", 1548 __func__, stid, tid, synqe, be64toh(opt0), be32toh(opt2)); 1549 } else { 1550 NET_EPOCH_EXIT(et); 1551 REJECT_PASS_ACCEPT_REQ(false); 1552 } 1553 1554 NET_EPOCH_EXIT(et); 1555 CURVNET_RESTORE(); 1556 return (0); 1557 reject: 1558 CURVNET_RESTORE(); 1559 CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid, 1560 reject_reason); 1561 1562 if (e) 1563 t4_l2t_release(e); 1564 release_tid(sc, tid, lctx->ctrlq); 1565 if (synqe) { 1566 inp = synqe->lctx->inp; 1567 INP_WLOCK(inp); 1568 inp = release_synqe(sc, synqe); 1569 if (inp) 1570 INP_WUNLOCK(inp); 1571 } 1572 1573 if (m) { 1574 /* 1575 * The connection request hit a TOE listener but is being passed 1576 * on to the kernel sw stack instead of getting offloaded. 1577 */ 1578 m_adj(m, sizeof(*cpl)); 1579 m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID | 1580 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 1581 m->m_pkthdr.csum_data = 0xffff; 1582 if_input(hw_ifp, m); 1583 } 1584 1585 return (reject_reason); 1586 } 1587 1588 static void 1589 synqe_to_protohdrs(struct adapter *sc, struct synq_entry *synqe, 1590 const struct cpl_pass_establish *cpl, struct in_conninfo *inc, 1591 struct tcphdr *th, struct tcpopt *to) 1592 { 1593 uint16_t tcp_opt = be16toh(cpl->tcp_opt); 1594 uint8_t iptos; 1595 1596 /* start off with the original SYN */ 1597 pass_accept_req_to_protohdrs(sc, synqe->syn, inc, th, &iptos); 1598 1599 /* modify parts to make it look like the ACK to our SYN|ACK */ 1600 tcp_set_flags(th, TH_ACK); 1601 th->th_ack = synqe->iss + 1; 1602 th->th_seq = be32toh(cpl->rcv_isn); 1603 bzero(to, sizeof(*to)); 1604 if (G_TCPOPT_TSTAMP(tcp_opt)) { 1605 to->to_flags |= TOF_TS; 1606 to->to_tsecr = synqe->ts; 1607 } 1608 } 1609 1610 static int 1611 do_pass_establish(struct sge_iq *iq, const struct rss_header *rss, 1612 struct mbuf *m) 1613 { 1614 struct adapter *sc = iq->adapter; 1615 struct vi_info *vi; 1616 if_t ifp; 1617 const struct cpl_pass_establish *cpl = (const void *)(rss + 1); 1618 #if defined(KTR) || defined(INVARIANTS) 1619 unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid)); 1620 #endif 1621 unsigned int tid = GET_TID(cpl); 1622 struct synq_entry *synqe = lookup_tid(sc, tid); 1623 struct listen_ctx *lctx = synqe->lctx; 1624 struct inpcb *inp = lctx->inp, *new_inp; 1625 struct socket *so; 1626 struct tcphdr th; 1627 struct tcpopt to; 1628 struct in_conninfo inc; 1629 struct toepcb *toep; 1630 struct epoch_tracker et; 1631 int rstreason; 1632 #ifdef INVARIANTS 1633 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1634 #endif 1635 1636 KASSERT(opcode == CPL_PASS_ESTABLISH, 1637 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1638 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1639 KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); 1640 KASSERT(synqe->flags & TPF_SYNQE, 1641 ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe)); 1642 1643 CURVNET_SET(lctx->vnet); 1644 NET_EPOCH_ENTER(et); /* for syncache_expand */ 1645 INP_WLOCK(inp); 1646 1647 CTR6(KTR_CXGBE, 1648 "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x", 1649 __func__, stid, tid, synqe, synqe->flags, inp->inp_flags); 1650 1651 ifp = synqe->syn->m_pkthdr.rcvif; 1652 vi = if_getsoftc(ifp); 1653 KASSERT(vi->adapter == sc, 1654 ("%s: vi %p, sc %p mismatch", __func__, vi, sc)); 1655 1656 if (__predict_false(inp->inp_flags & INP_DROPPED)) { 1657 reset: 1658 send_abort_rpl_synqe(TOEDEV(ifp), synqe, CPL_ABORT_SEND_RST); 1659 INP_WUNLOCK(inp); 1660 NET_EPOCH_EXIT(et); 1661 CURVNET_RESTORE(); 1662 return (0); 1663 } 1664 1665 KASSERT(synqe->params.rxq_idx == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0], 1666 ("%s: CPL arrived on unexpected rxq. %d %d", __func__, 1667 synqe->params.rxq_idx, 1668 (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0]))); 1669 1670 toep = alloc_toepcb(vi, M_NOWAIT); 1671 if (toep == NULL) 1672 goto reset; 1673 toep->tid = tid; 1674 toep->l2te = &sc->l2t->l2tab[synqe->params.l2t_idx]; 1675 toep->vnet = lctx->vnet; 1676 bcopy(&synqe->params, &toep->params, sizeof(toep->params)); 1677 init_toepcb(vi, toep); 1678 1679 MPASS(be32toh(cpl->snd_isn) - 1 == synqe->iss); 1680 MPASS(be32toh(cpl->rcv_isn) - 1 == synqe->irs); 1681 synqe->tcp_opt = cpl->tcp_opt; 1682 synqe->toep = toep; 1683 1684 /* Come up with something that syncache_expand should be ok with. */ 1685 synqe_to_protohdrs(sc, synqe, cpl, &inc, &th, &to); 1686 if (inc.inc_flags & INC_ISIPV6) { 1687 if (lctx->ce == NULL) { 1688 toep->ce = t4_get_clip_entry(sc, &inc.inc6_laddr, true); 1689 if (toep->ce == NULL) { 1690 free_toepcb(toep); 1691 goto reset; /* RST without a CLIP entry? */ 1692 } 1693 } else { 1694 t4_hold_clip_entry(sc, lctx->ce); 1695 toep->ce = lctx->ce; 1696 } 1697 } 1698 so = inp->inp_socket; 1699 KASSERT(so != NULL, ("%s: socket is NULL", __func__)); 1700 1701 rstreason = toe_syncache_expand(&inc, &to, &th, &so); 1702 if (rstreason < 0) { 1703 free_toepcb(toep); 1704 send_abort_rpl_synqe(TOEDEV(ifp), synqe, CPL_ABORT_NO_RST); 1705 INP_WUNLOCK(inp); 1706 NET_EPOCH_EXIT(et); 1707 CURVNET_RESTORE(); 1708 return (0); 1709 } else if (rstreason == 0 || so == NULL) { 1710 free_toepcb(toep); 1711 goto reset; 1712 } 1713 1714 /* New connection inpcb is already locked by syncache_expand(). */ 1715 new_inp = sotoinpcb(so); 1716 INP_WLOCK_ASSERT(new_inp); 1717 MPASS(so->so_vnet == lctx->vnet); 1718 1719 /* 1720 * This is for expansion from syncookies. 1721 * 1722 * XXX: we've held the tcbinfo lock throughout so there's no risk of 1723 * anyone accept'ing a connection before we've installed our hooks, but 1724 * this somewhat defeats the purpose of having a tod_offload_socket :-( 1725 */ 1726 if (__predict_false(!(synqe->flags & TPF_SYNQE_EXPANDED))) { 1727 tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0); 1728 t4_offload_socket(TOEDEV(ifp), synqe, so); 1729 } 1730 1731 INP_WUNLOCK(new_inp); 1732 1733 /* Done with the synqe */ 1734 inp = release_synqe(sc, synqe); 1735 if (inp != NULL) 1736 INP_WUNLOCK(inp); 1737 NET_EPOCH_EXIT(et); 1738 CURVNET_RESTORE(); 1739 1740 return (0); 1741 } 1742 1743 void 1744 t4_init_listen_cpl_handlers(void) 1745 { 1746 1747 t4_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl); 1748 t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl); 1749 t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req); 1750 t4_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish); 1751 } 1752 1753 void 1754 t4_uninit_listen_cpl_handlers(void) 1755 { 1756 1757 t4_register_cpl_handler(CPL_PASS_OPEN_RPL, NULL); 1758 t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, NULL); 1759 t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, NULL); 1760 t4_register_cpl_handler(CPL_PASS_ESTABLISH, NULL); 1761 } 1762 #endif 1763