xref: /freebsd/sys/dev/cxgbe/tom/t4_listen.c (revision 40dbb06fa73cac37d57563c07e55efd0cabbd488)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2012 Chelsio Communications, Inc.
5  * All rights reserved.
6  * Written by: Navdeep Parhar <np@FreeBSD.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include <sys/cdefs.h>
31 #include "opt_inet.h"
32 #include "opt_inet6.h"
33 
34 #ifdef TCP_OFFLOAD
35 #include <sys/param.h>
36 #include <sys/types.h>
37 #include <sys/kernel.h>
38 #include <sys/ktr.h>
39 #include <sys/module.h>
40 #include <sys/protosw.h>
41 #include <sys/refcount.h>
42 #include <sys/domain.h>
43 #include <sys/fnv_hash.h>
44 #include <sys/socket.h>
45 #include <sys/socketvar.h>
46 #include <sys/sysctl.h>
47 #include <net/ethernet.h>
48 #include <net/if.h>
49 #include <net/if_types.h>
50 #include <net/if_vlan_var.h>
51 #include <net/route.h>
52 #include <net/route/nhop.h>
53 #include <netinet/in.h>
54 #include <netinet/in_fib.h>
55 #include <netinet/in_pcb.h>
56 #include <netinet/ip.h>
57 #include <netinet/ip6.h>
58 #include <netinet6/in6_fib.h>
59 #include <netinet6/scope6_var.h>
60 #include <netinet/tcp_timer.h>
61 #define TCPSTATES
62 #include <netinet/tcp_fsm.h>
63 #include <netinet/tcp_var.h>
64 #include <netinet/toecore.h>
65 #include <netinet/cc/cc.h>
66 
67 #include "common/common.h"
68 #include "common/t4_msg.h"
69 #include "common/t4_regs.h"
70 #include "t4_clip.h"
71 #include "tom/t4_tom_l2t.h"
72 #include "tom/t4_tom.h"
73 
74 /* stid services */
75 static int alloc_stid(struct adapter *, bool, void *);
76 static struct listen_ctx *lookup_stid(struct adapter *, int);
77 static void free_stid(struct adapter *, int , bool);
78 
79 /* lctx services */
80 static struct listen_ctx *alloc_lctx(struct adapter *, struct inpcb *,
81     struct vi_info *);
82 static int free_lctx(struct adapter *, struct listen_ctx *);
83 static void hold_lctx(struct listen_ctx *);
84 static void listen_hash_add(struct adapter *, struct listen_ctx *);
85 static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *);
86 static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *);
87 static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *);
88 
89 static void send_abort_rpl_synqe(struct toedev *, struct synq_entry *, int);
90 
91 static int create_server6(struct adapter *, struct listen_ctx *);
92 static int create_server(struct adapter *, struct listen_ctx *);
93 
94 int
alloc_stid_tab(struct adapter * sc)95 alloc_stid_tab(struct adapter *sc)
96 {
97 	struct tid_info *t = &sc->tids;
98 
99 	MPASS(t->nstids > 0);
100 	MPASS(t->stid_tab == NULL);
101 
102 	t->stid_tab = malloc(t->nstids * sizeof(*t->stid_tab), M_CXGBE,
103 	    M_ZERO | M_NOWAIT);
104 	if (t->stid_tab == NULL)
105 		return (ENOMEM);
106 	t->stid_bitmap = bit_alloc(t->nstids, M_CXGBE, M_NOWAIT);
107 	if (t->stid_bitmap == NULL) {
108 		free(t->stid_tab, M_CXGBE);
109 		t->stid_tab = NULL;
110 		return (ENOMEM);
111 	}
112 	mtx_init(&t->stid_lock, "stid lock", NULL, MTX_DEF);
113 	t->stids_in_use = 0;
114 
115 	return (0);
116 }
117 
118 void
free_stid_tab(struct adapter * sc)119 free_stid_tab(struct adapter *sc)
120 {
121 	struct tid_info *t = &sc->tids;
122 
123 	KASSERT(t->stids_in_use == 0,
124 	    ("%s: %d tids still in use.", __func__, t->stids_in_use));
125 
126 	if (mtx_initialized(&t->stid_lock))
127 		mtx_destroy(&t->stid_lock);
128 	free(t->stid_tab, M_CXGBE);
129 	t->stid_tab = NULL;
130 	free(t->stid_bitmap, M_CXGBE);
131 	t->stid_bitmap = NULL;
132 }
133 
134 void
stop_stid_tab(struct adapter * sc)135 stop_stid_tab(struct adapter *sc)
136 {
137 	struct tid_info *t = &sc->tids;
138 	struct tom_data *td = sc->tom_softc;
139 	struct listen_ctx *lctx;
140 	struct synq_entry *synqe;
141 	int i, ntids;
142 
143 	mtx_lock(&t->stid_lock);
144 	t->stid_tab_stopped = true;
145 	mtx_unlock(&t->stid_lock);
146 
147 	mtx_lock(&td->lctx_hash_lock);
148 	for (i = 0; i <= td->listen_mask; i++) {
149 		LIST_FOREACH(lctx, &td->listen_hash[i], link)
150 			lctx->flags &= ~(LCTX_RPL_PENDING | LCTX_SETUP_IN_HW);
151 	}
152 	mtx_unlock(&td->lctx_hash_lock);
153 
154 	mtx_lock(&td->toep_list_lock);
155 	TAILQ_FOREACH(synqe, &td->synqe_list, link) {
156 		MPASS(sc->incarnation == synqe->incarnation);
157 		MPASS(synqe->tid >= 0);
158 		MPASS(synqe == lookup_tid(sc, synqe->tid));
159 		/* Remove tid from the lookup table immediately. */
160 		CTR(KTR_CXGBE, "%s: tid %d@%d STRANDED, removed from table",
161 		    __func__, synqe->tid, synqe->incarnation);
162 		ntids = synqe->lctx->inp->inp_vflag & INP_IPV6 ? 2 : 1;
163 		remove_tid(sc, synqe->tid, ntids);
164 #if 0
165 		/* synqe->tid is stale now but left alone for debug. */
166 		synqe->tid = -1;
167 #endif
168 	}
169 	MPASS(TAILQ_EMPTY(&td->stranded_synqe));
170 	TAILQ_CONCAT(&td->stranded_synqe, &td->synqe_list, link);
171 	MPASS(TAILQ_EMPTY(&td->synqe_list));
172 	mtx_unlock(&td->toep_list_lock);
173 }
174 
175 void
restart_stid_tab(struct adapter * sc)176 restart_stid_tab(struct adapter *sc)
177 {
178 	struct tid_info *t = &sc->tids;
179 	struct tom_data *td = sc->tom_softc;
180 	struct listen_ctx *lctx;
181 	int i;
182 
183 	mtx_lock(&td->lctx_hash_lock);
184 	for (i = 0; i <= td->listen_mask; i++) {
185 		LIST_FOREACH(lctx, &td->listen_hash[i], link) {
186 			MPASS((lctx->flags & (LCTX_RPL_PENDING | LCTX_SETUP_IN_HW)) == 0);
187 			lctx->flags |= LCTX_RPL_PENDING;
188 			if (lctx->inp->inp_vflag & INP_IPV6)
189 				create_server6(sc, lctx);
190 			else
191 				create_server(sc, lctx);
192 		}
193 	}
194 	mtx_unlock(&td->lctx_hash_lock);
195 
196 	mtx_lock(&t->stid_lock);
197 	t->stid_tab_stopped = false;
198 	mtx_unlock(&t->stid_lock);
199 
200 }
201 
202 static int
alloc_stid(struct adapter * sc,bool isipv6,void * ctx)203 alloc_stid(struct adapter *sc, bool isipv6, void *ctx)
204 {
205 	struct tid_info *t = &sc->tids;
206 	const u_int n = isipv6 ? 2 : 1;
207 	int stid, pair_stid;
208 	u_int i;
209 	ssize_t val;
210 
211 	mtx_lock(&t->stid_lock);
212 	MPASS(t->stids_in_use <= t->nstids);
213 	if (n > t->nstids - t->stids_in_use || t->stid_tab_stopped) {
214 		mtx_unlock(&t->stid_lock);
215 		return (-1);
216 	}
217 
218 	stid = -1;
219 	if (isipv6) {
220 		/*
221 		 * An IPv6 server needs 2 naturally aligned stids (1 stid = 4
222 		 * cells) in the TCAM.  We know that the start of the stid
223 		 * region is properly aligned already (the chip requires each
224 		 * region to be 128-cell aligned).
225 		 */
226 		for (i = 0; i + 1 < t->nstids; i = roundup2(val + 1, 2)) {
227 			bit_ffc_area_at(t->stid_bitmap, i, t->nstids, 2, &val);
228 			if (val == -1)
229 				break;
230 			if ((val & 1) == 0) {
231 				stid = val;
232 				break;
233 			}
234 		}
235 	} else {
236 		/*
237 		 * An IPv4 server needs one stid without any alignment
238 		 * requirements.  But we try extra hard to find an available
239 		 * stid adjacent to a used stid so that free "stid-pairs" are
240 		 * left intact for IPv6.
241 		 */
242 		bit_ffc_at(t->stid_bitmap, 0, t->nstids, &val);
243 		while (val != -1) {
244 			if (stid == -1) {
245 				/*
246 				 * First usable stid.  Look no further if it's
247 				 * an ideal fit.
248 				 */
249 				stid = val;
250 				if (val & 1 || bit_test(t->stid_bitmap, val + 1))
251 					break;
252 			} else {
253 				/*
254 				 * We have an unused stid already but are now
255 				 * looking for in-use stids because we'd prefer
256 				 * to grab an unused stid adjacent to one that's
257 				 * in use.
258 				 *
259 				 * Odd stids pair with the previous stid and
260 				 * even ones pair with the next stid.
261 				 */
262 				pair_stid = val & 1 ? val - 1 : val + 1;
263 				if (bit_test(t->stid_bitmap, pair_stid) == 0) {
264 					stid = pair_stid;
265 					break;
266 				}
267 			}
268 			val = roundup2(val + 1, 2);
269 			if (val >= t->nstids)
270 				break;
271 			bit_ffs_at(t->stid_bitmap, val, t->nstids, &val);
272 		}
273 	}
274 
275 	if (stid >= 0) {
276 		MPASS(stid + n - 1 < t->nstids);
277 		MPASS(bit_ntest(t->stid_bitmap, stid, stid + n - 1, 0));
278 		bit_nset(t->stid_bitmap, stid, stid + n - 1);
279 		t->stids_in_use += n;
280 		t->stid_tab[stid] = ctx;
281 #ifdef INVARIANTS
282 		if (n == 2) {
283 			MPASS((stid & 1) == 0);
284 			t->stid_tab[stid + 1] = NULL;
285 		}
286 #endif
287 		stid += t->stid_base;
288 	}
289 	mtx_unlock(&t->stid_lock);
290 	return (stid);
291 }
292 
293 static struct listen_ctx *
lookup_stid(struct adapter * sc,int stid)294 lookup_stid(struct adapter *sc, int stid)
295 {
296 	struct tid_info *t = &sc->tids;
297 
298 	return (t->stid_tab[stid - t->stid_base]);
299 }
300 
301 static void
free_stid(struct adapter * sc,int stid,bool isipv6)302 free_stid(struct adapter *sc, int stid, bool isipv6)
303 {
304 	struct tid_info *t = &sc->tids;
305 	const u_int n = isipv6 ? 2 : 1;
306 
307 	mtx_lock(&t->stid_lock);
308 	MPASS(stid >= t->stid_base);
309 	stid -= t->stid_base;
310 	MPASS(stid + n - 1 < t->nstids);
311 	MPASS(t->stids_in_use <= t->nstids);
312 	MPASS(t->stids_in_use >= n);
313 	MPASS(t->stid_tab[stid] != NULL);
314 #ifdef INVARIANTS
315 	if (n == 2) {
316 		MPASS((stid & 1) == 0);
317 		MPASS(t->stid_tab[stid + 1] == NULL);
318 	}
319 #endif
320 	MPASS(bit_ntest(t->stid_bitmap, stid, stid + n - 1, 1));
321 	bit_nclear(t->stid_bitmap, stid, stid + n - 1);
322 	t->stid_tab[stid] = NULL;
323 	t->stids_in_use -= n;
324 	mtx_unlock(&t->stid_lock);
325 }
326 
327 static struct listen_ctx *
alloc_lctx(struct adapter * sc,struct inpcb * inp,struct vi_info * vi)328 alloc_lctx(struct adapter *sc, struct inpcb *inp, struct vi_info *vi)
329 {
330 	struct listen_ctx *lctx;
331 
332 	INP_WLOCK_ASSERT(inp);
333 
334 	lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO);
335 	if (lctx == NULL)
336 		return (NULL);
337 
338 	lctx->isipv6 = inp->inp_vflag & INP_IPV6;
339 	lctx->stid = alloc_stid(sc, lctx->isipv6, lctx);
340 	if (lctx->stid < 0) {
341 		free(lctx, M_CXGBE);
342 		return (NULL);
343 	}
344 
345 	if (lctx->isipv6 &&
346 	    !IN6_ARE_ADDR_EQUAL(&in6addr_any, &inp->in6p_laddr)) {
347 		lctx->ce = t4_get_clip_entry(sc, &inp->in6p_laddr, true);
348 		if (lctx->ce == NULL) {
349 			free(lctx, M_CXGBE);
350 			return (NULL);
351 		}
352 	}
353 
354 	lctx->ctrlq = &sc->sge.ctrlq[vi->pi->port_id];
355 	lctx->ofld_rxq = &sc->sge.ofld_rxq[vi->first_ofld_rxq];
356 	refcount_init(&lctx->refcount, 1);
357 
358 	lctx->inp = inp;
359 	lctx->vnet = inp->inp_socket->so_vnet;
360 	in_pcbref(inp);
361 
362 	return (lctx);
363 }
364 
365 /* Don't call this directly, use release_lctx instead */
366 static int
free_lctx(struct adapter * sc,struct listen_ctx * lctx)367 free_lctx(struct adapter *sc, struct listen_ctx *lctx)
368 {
369 	struct inpcb *inp = lctx->inp;
370 
371 	INP_WLOCK_ASSERT(inp);
372 	KASSERT(lctx->refcount == 0,
373 	    ("%s: refcount %d", __func__, lctx->refcount));
374 	KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid));
375 
376 	CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p",
377 	    __func__, lctx->stid, lctx, lctx->inp);
378 
379 	if (lctx->ce)
380 		t4_release_clip_entry(sc, lctx->ce);
381 	free_stid(sc, lctx->stid, lctx->isipv6);
382 	free(lctx, M_CXGBE);
383 
384 	return (in_pcbrele_wlocked(inp));
385 }
386 
387 static void
hold_lctx(struct listen_ctx * lctx)388 hold_lctx(struct listen_ctx *lctx)
389 {
390 
391 	refcount_acquire(&lctx->refcount);
392 }
393 
394 static inline uint32_t
listen_hashfn(void * key,u_long mask)395 listen_hashfn(void *key, u_long mask)
396 {
397 
398 	return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask);
399 }
400 
401 /*
402  * Add a listen_ctx entry to the listen hash table.
403  */
404 static void
listen_hash_add(struct adapter * sc,struct listen_ctx * lctx)405 listen_hash_add(struct adapter *sc, struct listen_ctx *lctx)
406 {
407 	struct tom_data *td = sc->tom_softc;
408 	int bucket = listen_hashfn(lctx->inp, td->listen_mask);
409 
410 	mtx_lock(&td->lctx_hash_lock);
411 	LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link);
412 	td->lctx_count++;
413 	mtx_unlock(&td->lctx_hash_lock);
414 }
415 
416 /*
417  * Look for the listening socket's context entry in the hash and return it.
418  */
419 static struct listen_ctx *
listen_hash_find(struct adapter * sc,struct inpcb * inp)420 listen_hash_find(struct adapter *sc, struct inpcb *inp)
421 {
422 	struct tom_data *td = sc->tom_softc;
423 	int bucket = listen_hashfn(inp, td->listen_mask);
424 	struct listen_ctx *lctx;
425 
426 	mtx_lock(&td->lctx_hash_lock);
427 	LIST_FOREACH(lctx, &td->listen_hash[bucket], link) {
428 		if (lctx->inp == inp)
429 			break;
430 	}
431 	mtx_unlock(&td->lctx_hash_lock);
432 
433 	return (lctx);
434 }
435 
436 /*
437  * Removes the listen_ctx structure for inp from the hash and returns it.
438  */
439 static struct listen_ctx *
listen_hash_del(struct adapter * sc,struct inpcb * inp)440 listen_hash_del(struct adapter *sc, struct inpcb *inp)
441 {
442 	struct tom_data *td = sc->tom_softc;
443 	int bucket = listen_hashfn(inp, td->listen_mask);
444 	struct listen_ctx *lctx, *l;
445 
446 	mtx_lock(&td->lctx_hash_lock);
447 	LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) {
448 		if (lctx->inp == inp) {
449 			LIST_REMOVE(lctx, link);
450 			td->lctx_count--;
451 			break;
452 		}
453 	}
454 	mtx_unlock(&td->lctx_hash_lock);
455 
456 	return (lctx);
457 }
458 
459 /*
460  * Releases a hold on the lctx.  Must be called with the listening socket's inp
461  * locked.  The inp may be freed by this function and it returns NULL to
462  * indicate this.
463  */
464 static struct inpcb *
release_lctx(struct adapter * sc,struct listen_ctx * lctx)465 release_lctx(struct adapter *sc, struct listen_ctx *lctx)
466 {
467 	struct inpcb *inp = lctx->inp;
468 	int inp_freed = 0;
469 
470 	INP_WLOCK_ASSERT(inp);
471 	if (refcount_release(&lctx->refcount))
472 		inp_freed = free_lctx(sc, lctx);
473 
474 	return (inp_freed ? NULL : inp);
475 }
476 
477 static void
send_flowc_wr_synqe(struct adapter * sc,struct synq_entry * synqe)478 send_flowc_wr_synqe(struct adapter *sc, struct synq_entry *synqe)
479 {
480 	struct mbuf *m = synqe->syn;
481 	if_t ifp = m->m_pkthdr.rcvif;
482 	struct vi_info *vi = if_getsoftc(ifp);
483 	struct port_info *pi = vi->pi;
484 	struct wrqe *wr;
485 	struct fw_flowc_wr *flowc;
486 	struct sge_ofld_txq *ofld_txq;
487 	struct sge_ofld_rxq *ofld_rxq;
488 	const int nparams = 6;
489 	const int flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
490 	const u_int pfvf = sc->pf << S_FW_VIID_PFN;
491 
492 	INP_WLOCK_ASSERT(synqe->lctx->inp);
493 	MPASS((synqe->flags & TPF_FLOWC_WR_SENT) == 0);
494 
495 	ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx];
496 	ofld_rxq = &sc->sge.ofld_rxq[synqe->params.rxq_idx];
497 
498 	wr = alloc_wrqe(roundup2(flowclen, 16), &ofld_txq->wrq);
499 	if (wr == NULL) {
500 		/* XXX */
501 		panic("%s: allocation failure.", __func__);
502 	}
503 	flowc = wrtod(wr);
504 	memset(flowc, 0, wr->wr_len);
505 	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
506 	    V_FW_FLOWC_WR_NPARAMS(nparams));
507 	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
508 	    V_FW_WR_FLOWID(synqe->tid));
509 	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
510 	flowc->mnemval[0].val = htobe32(pfvf);
511 	/* Firmware expects hw port and will translate to channel itself. */
512 	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
513 	flowc->mnemval[1].val = htobe32(pi->hw_port);
514 	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
515 	flowc->mnemval[2].val = htobe32(pi->hw_port);
516 	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
517 	flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id);
518 	flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF;
519 	flowc->mnemval[4].val = htobe32(512);
520 	flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_MSS;
521 	flowc->mnemval[5].val = htobe32(512);
522 
523 	synqe->flags |= TPF_FLOWC_WR_SENT;
524 	t4_wrq_tx(sc, wr);
525 }
526 
527 static void
send_abort_rpl_synqe(struct toedev * tod,struct synq_entry * synqe,int rst_status)528 send_abort_rpl_synqe(struct toedev *tod, struct synq_entry *synqe,
529     int rst_status)
530 {
531 	struct adapter *sc = tod->tod_softc;
532 	struct wrqe *wr;
533 	struct cpl_abort_req *req;
534 
535 	INP_WLOCK_ASSERT(synqe->lctx->inp);
536 
537 	CTR5(KTR_CXGBE, "%s: synqe %p (0x%x), tid %d%s",
538 	    __func__, synqe, synqe->flags, synqe->tid,
539 	    synqe->flags & TPF_ABORT_SHUTDOWN ?
540 	    " (abort already in progress)" : "");
541 	if (synqe->flags & TPF_ABORT_SHUTDOWN)
542 		return;	/* abort already in progress */
543 	synqe->flags |= TPF_ABORT_SHUTDOWN;
544 
545 	if (!(synqe->flags & TPF_FLOWC_WR_SENT))
546 		send_flowc_wr_synqe(sc, synqe);
547 
548 	wr = alloc_wrqe(sizeof(*req),
549 	    &sc->sge.ofld_txq[synqe->params.txq_idx].wrq);
550 	if (wr == NULL) {
551 		/* XXX */
552 		panic("%s: allocation failure.", __func__);
553 	}
554 	req = wrtod(wr);
555 	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid);
556 	req->rsvd0 = 0;	/* don't have a snd_nxt */
557 	req->rsvd1 = 1;	/* no data sent yet */
558 	req->cmd = rst_status;
559 
560 	t4_l2t_send(sc, wr, &sc->l2t->l2tab[synqe->params.l2t_idx]);
561 }
562 
563 static int
create_server(struct adapter * sc,struct listen_ctx * lctx)564 create_server(struct adapter *sc, struct listen_ctx *lctx)
565 {
566 	struct wrqe *wr;
567 	struct cpl_pass_open_req *req;
568 	struct inpcb *inp = lctx->inp;
569 
570 	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
571 	if (wr == NULL) {
572 		log(LOG_ERR, "%s: allocation failure", __func__);
573 		return (ENOMEM);
574 	}
575 	req = wrtod(wr);
576 
577 	INIT_TP_WR(req, 0);
578 	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid));
579 	req->local_port = inp->inp_lport;
580 	req->peer_port = 0;
581 	req->local_ip = inp->inp_laddr.s_addr;
582 	req->peer_ip = 0;
583 	req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
584 	req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
585 	    F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));
586 
587 	t4_wrq_tx(sc, wr);
588 	return (0);
589 }
590 
591 static int
create_server6(struct adapter * sc,struct listen_ctx * lctx)592 create_server6(struct adapter *sc, struct listen_ctx *lctx)
593 {
594 	struct wrqe *wr;
595 	struct cpl_pass_open_req6 *req;
596 	struct inpcb *inp = lctx->inp;
597 
598 	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
599 	if (wr == NULL) {
600 		log(LOG_ERR, "%s: allocation failure", __func__);
601 		return (ENOMEM);
602 	}
603 	req = wrtod(wr);
604 
605 	INIT_TP_WR(req, 0);
606 	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, lctx->stid));
607 	req->local_port = inp->inp_lport;
608 	req->peer_port = 0;
609 	req->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0];
610 	req->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8];
611 	req->peer_ip_hi = 0;
612 	req->peer_ip_lo = 0;
613 	req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
614 	req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
615 	    F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));
616 
617 	t4_wrq_tx(sc, wr);
618 	return (0);
619 }
620 
621 static int
destroy_server(struct adapter * sc,struct listen_ctx * lctx)622 destroy_server(struct adapter *sc, struct listen_ctx *lctx)
623 {
624 	struct wrqe *wr;
625 	struct cpl_close_listsvr_req *req;
626 
627 	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
628 	if (wr == NULL) {
629 		/* XXX */
630 		panic("%s: allocation failure.", __func__);
631 	}
632 	req = wrtod(wr);
633 
634 	INIT_TP_WR(req, 0);
635 	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ,
636 	    lctx->stid));
637 	req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id);
638 	req->rsvd = htobe16(0);
639 
640 	t4_wrq_tx(sc, wr);
641 	return (0);
642 }
643 
644 /*
645  * Start a listening server by sending a passive open request to HW.
646  *
647  * Can't take adapter lock here and access to sc->flags,
648  * sc->offload_map, if_capenable are all race prone.
649  */
650 int
t4_listen_start(struct toedev * tod,struct tcpcb * tp)651 t4_listen_start(struct toedev *tod, struct tcpcb *tp)
652 {
653 	struct adapter *sc = tod->tod_softc;
654 	struct vi_info *vi;
655 	struct port_info *pi;
656 	struct inpcb *inp = tptoinpcb(tp);
657 	struct listen_ctx *lctx;
658 	int i, rc, v;
659 	struct offload_settings settings;
660 
661 	INP_WLOCK_ASSERT(inp);
662 
663 	rw_rlock(&sc->policy_lock);
664 	settings = *lookup_offload_policy(sc, OPEN_TYPE_LISTEN, NULL,
665 	    EVL_MAKETAG(0xfff, 0, 0), inp);
666 	rw_runlock(&sc->policy_lock);
667 	if (!settings.offload)
668 		return (0);
669 
670 	/* Don't start a hardware listener for any loopback address. */
671 	if (inp->inp_vflag & INP_IPV6 && IN6_IS_ADDR_LOOPBACK(&inp->in6p_laddr))
672 		return (0);
673 	if (!(inp->inp_vflag & INP_IPV6) &&
674 	    IN_LOOPBACK(ntohl(inp->inp_laddr.s_addr)))
675 		return (0);
676 	if (sc->flags & KERN_TLS_ON)
677 		return (0);
678 #if 0
679 	ADAPTER_LOCK(sc);
680 	if (IS_BUSY(sc)) {
681 		log(LOG_ERR, "%s: listen request ignored, %s is busy",
682 		    __func__, device_get_nameunit(sc->dev));
683 		goto done;
684 	}
685 
686 	KASSERT(uld_active(sc, ULD_TOM),
687 	    ("%s: TOM not initialized", __func__));
688 #endif
689 
690 	/*
691 	 * Find an initialized VI with IFCAP_TOE (4 or 6).  We'll use the first
692 	 * such VI's queues to send the passive open and receive the reply to
693 	 * it.
694 	 *
695 	 * XXX: need a way to mark a port in use by offload.  if_cxgbe should
696 	 * then reject any attempt to bring down such a port (and maybe reject
697 	 * attempts to disable IFCAP_TOE on that port too?).
698 	 */
699 	for_each_port(sc, i) {
700 		pi = sc->port[i];
701 		for_each_vi(pi, v, vi) {
702 			if (vi->flags & VI_INIT_DONE &&
703 			    if_getcapenable(vi->ifp) & IFCAP_TOE)
704 				goto found;
705 		}
706 	}
707 	goto done;	/* no port that's UP with IFCAP_TOE enabled */
708 found:
709 
710 	if (listen_hash_find(sc, inp) != NULL)
711 		goto done;	/* already setup */
712 
713 	lctx = alloc_lctx(sc, inp, vi);
714 	if (lctx == NULL) {
715 		log(LOG_ERR,
716 		    "%s: listen request ignored, %s couldn't allocate lctx\n",
717 		    __func__, device_get_nameunit(sc->dev));
718 		goto done;
719 	}
720 	listen_hash_add(sc, lctx);
721 
722 	CTR6(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p vflag 0x%x",
723 	    __func__, lctx->stid, tcpstates[tp->t_state], lctx, inp,
724 	    inp->inp_vflag);
725 
726 	if (inp->inp_vflag & INP_IPV6)
727 		rc = create_server6(sc, lctx);
728 	else
729 		rc = create_server(sc, lctx);
730 	if (rc != 0) {
731 		log(LOG_ERR, "%s: %s failed to create hw listener: %d.\n",
732 		    __func__, device_get_nameunit(sc->dev), rc);
733 		(void) listen_hash_del(sc, inp);
734 		inp = release_lctx(sc, lctx);
735 		/* can't be freed, host stack has a reference */
736 		KASSERT(inp != NULL, ("%s: inp freed", __func__));
737 		goto done;
738 	}
739 	lctx->flags |= LCTX_RPL_PENDING;
740 done:
741 #if 0
742 	ADAPTER_UNLOCK(sc);
743 #endif
744 	return (0);
745 }
746 
747 int
t4_listen_stop(struct toedev * tod,struct tcpcb * tp)748 t4_listen_stop(struct toedev *tod, struct tcpcb *tp)
749 {
750 	struct listen_ctx *lctx;
751 	struct adapter *sc = tod->tod_softc;
752 	struct inpcb *inp = tptoinpcb(tp);
753 
754 	INP_WLOCK_ASSERT(inp);
755 
756 	lctx = listen_hash_del(sc, inp);
757 	if (lctx == NULL)
758 		return (ENOENT);	/* no hardware listener for this inp */
759 
760 	CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid,
761 	    lctx, lctx->flags);
762 
763 	/*
764 	 * If the reply to the PASS_OPEN is still pending we'll wait for it to
765 	 * arrive and clean up when it does.
766 	 */
767 	if (lctx->flags & LCTX_RPL_PENDING) {
768 		return (EINPROGRESS);
769 	}
770 
771 	if (lctx->flags & LCTX_SETUP_IN_HW)
772 		destroy_server(sc, lctx);
773 	else
774 		inp = release_lctx(sc, lctx);
775 	return (0);
776 }
777 
778 static inline struct synq_entry *
alloc_synqe(struct adapter * sc,struct listen_ctx * lctx,int flags)779 alloc_synqe(struct adapter *sc, struct listen_ctx *lctx, int flags)
780 {
781 	struct synq_entry *synqe;
782 
783 	INP_RLOCK_ASSERT(lctx->inp);
784 	MPASS(flags == M_WAITOK || flags == M_NOWAIT);
785 
786 	synqe = malloc(sizeof(*synqe), M_CXGBE, flags);
787 	if (__predict_true(synqe != NULL)) {
788 		synqe->flags = TPF_SYNQE;
789 		synqe->incarnation = sc->incarnation;
790 		refcount_init(&synqe->refcnt, 1);
791 		synqe->lctx = lctx;
792 		hold_lctx(lctx);	/* Every synqe has a ref on its lctx. */
793 		synqe->syn = NULL;
794 	}
795 
796 	return (synqe);
797 }
798 
799 static inline void
hold_synqe(struct synq_entry * synqe)800 hold_synqe(struct synq_entry *synqe)
801 {
802 
803 	refcount_acquire(&synqe->refcnt);
804 }
805 
806 static inline struct inpcb *
release_synqe(struct adapter * sc,struct synq_entry * synqe)807 release_synqe(struct adapter *sc, struct synq_entry *synqe)
808 {
809 	struct inpcb *inp;
810 
811 	MPASS(synqe->flags & TPF_SYNQE);
812 	MPASS(synqe->lctx != NULL);
813 
814 	inp = synqe->lctx->inp;
815 	MPASS(inp != NULL);
816 	INP_WLOCK_ASSERT(inp);
817 
818 	if (refcount_release(&synqe->refcnt)) {
819 		inp = release_lctx(sc, synqe->lctx);
820 		m_freem(synqe->syn);
821 		free(synqe, M_CXGBE);
822 	}
823 
824 	return (inp);
825 }
826 
827 void
t4_syncache_added(struct toedev * tod __unused,void * arg)828 t4_syncache_added(struct toedev *tod __unused, void *arg)
829 {
830 	struct synq_entry *synqe = arg;
831 
832 	hold_synqe(synqe);
833 }
834 
835 void
t4_syncache_removed(struct toedev * tod,void * arg)836 t4_syncache_removed(struct toedev *tod, void *arg)
837 {
838 	struct adapter *sc = tod->tod_softc;
839 	struct synq_entry *synqe = arg;
840 	struct inpcb *inp = synqe->lctx->inp;
841 
842 	/*
843 	 * XXX: this is a LOR but harmless when running from the softclock.
844 	 */
845 	INP_WLOCK(inp);
846 	inp = release_synqe(sc, synqe);
847 	if (inp != NULL)
848 		INP_WUNLOCK(inp);
849 }
850 
851 int
t4_syncache_respond(struct toedev * tod,void * arg,struct mbuf * m)852 t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m)
853 {
854 	struct synq_entry *synqe = arg;
855 
856 	if (atomic_fetchadd_int(&synqe->ok_to_respond, 1) == 0) {
857 		struct tcpopt to;
858 		struct ip *ip = mtod(m, struct ip *);
859 		struct tcphdr *th;
860 
861 		if (ip->ip_v == IPVERSION)
862 			th = (void *)(ip + 1);
863 		else
864 			th = (void *)((struct ip6_hdr *)ip + 1);
865 		bzero(&to, sizeof(to));
866 		tcp_dooptions(&to, (void *)(th + 1),
867 		    (th->th_off << 2) - sizeof(*th), TO_SYN);
868 
869 		/* save these for later */
870 		synqe->iss = be32toh(th->th_seq);
871 		synqe->irs = be32toh(th->th_ack) - 1;
872 		synqe->ts = to.to_tsval;
873 	}
874 
875 	m_freem(m);	/* don't need this any more */
876 	return (0);
877 }
878 
879 static int
do_pass_open_rpl(struct sge_iq * iq,const struct rss_header * rss,struct mbuf * m)880 do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss,
881     struct mbuf *m)
882 {
883 	struct adapter *sc = iq->adapter;
884 	const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1);
885 	int stid = GET_TID(cpl);
886 	unsigned int status = cpl->status;
887 	struct listen_ctx *lctx = lookup_stid(sc, stid);
888 	struct inpcb *inp = lctx->inp;
889 	struct tcpcb *tp = intotcpcb(inp);
890 #ifdef INVARIANTS
891 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
892 #endif
893 
894 	KASSERT(opcode == CPL_PASS_OPEN_RPL,
895 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
896 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
897 	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
898 
899 	INP_WLOCK(inp);
900 
901 	CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x",
902 	    __func__, stid, status, lctx->flags);
903 
904 	lctx->flags &= ~LCTX_RPL_PENDING;
905 	if (status == CPL_ERR_NONE)
906 		lctx->flags |= LCTX_SETUP_IN_HW;
907 	else
908 		log(LOG_ERR, "listener (stid %u) failed: %d\n", stid, status);
909 
910 #ifdef INVARIANTS
911 	/*
912 	 * If the inp has been dropped (listening socket closed) then
913 	 * listen_stop must have run and taken the inp out of the hash.
914 	 */
915 	if (tp->t_flags & TF_DISCONNECTED) {
916 		KASSERT(listen_hash_del(sc, inp) == NULL,
917 		    ("%s: inp %p still in listen hash", __func__, inp));
918 	}
919 #endif
920 
921 	if (tp->t_flags & TF_DISCONNECTED && status != CPL_ERR_NONE) {
922 		if (release_lctx(sc, lctx) != NULL)
923 			INP_WUNLOCK(inp);
924 		return (status);
925 	}
926 
927 	/*
928 	 * Listening socket stopped listening earlier and now the chip tells us
929 	 * it has started the hardware listener.  Stop it; the lctx will be
930 	 * released in do_close_server_rpl.
931 	 */
932 	if (tp->t_flags & TF_DISCONNECTED) {
933 		destroy_server(sc, lctx);
934 		INP_WUNLOCK(inp);
935 		return (status);
936 	}
937 
938 	/*
939 	 * Failed to start hardware listener.  Take inp out of the hash and
940 	 * release our reference on it.  An error message has been logged
941 	 * already.
942 	 */
943 	if (status != CPL_ERR_NONE) {
944 		listen_hash_del(sc, inp);
945 		if (release_lctx(sc, lctx) != NULL)
946 			INP_WUNLOCK(inp);
947 		return (status);
948 	}
949 
950 	/* hardware listener open for business */
951 
952 	INP_WUNLOCK(inp);
953 	return (status);
954 }
955 
956 static int
do_close_server_rpl(struct sge_iq * iq,const struct rss_header * rss,struct mbuf * m)957 do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss,
958     struct mbuf *m)
959 {
960 	struct adapter *sc = iq->adapter;
961 	const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1);
962 	int stid = GET_TID(cpl);
963 	unsigned int status = cpl->status;
964 	struct listen_ctx *lctx = lookup_stid(sc, stid);
965 	struct inpcb *inp = lctx->inp;
966 #ifdef INVARIANTS
967 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
968 #endif
969 
970 	KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL,
971 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
972 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
973 	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
974 
975 	CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status);
976 
977 	if (status != CPL_ERR_NONE) {
978 		log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n",
979 		    __func__, status, stid);
980 		return (status);
981 	}
982 
983 	INP_WLOCK(inp);
984 	inp = release_lctx(sc, lctx);
985 	if (inp != NULL)
986 		INP_WUNLOCK(inp);
987 
988 	return (status);
989 }
990 
991 static void
done_with_synqe(struct adapter * sc,struct synq_entry * synqe)992 done_with_synqe(struct adapter *sc, struct synq_entry *synqe)
993 {
994 	struct tom_data *td = sc->tom_softc;
995 	struct listen_ctx *lctx = synqe->lctx;
996 	struct inpcb *inp = lctx->inp;
997 	struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx];
998 	int ntids;
999 
1000 	INP_WLOCK_ASSERT(inp);
1001 
1002 	if (synqe->tid != -1) {
1003 		ntids = inp->inp_vflag & INP_IPV6 ? 2 : 1;
1004 		remove_tid(sc, synqe->tid, ntids);
1005 		mtx_lock(&td->toep_list_lock);
1006 		TAILQ_REMOVE(&td->synqe_list, synqe, link);
1007 		mtx_unlock(&td->toep_list_lock);
1008 		release_tid(sc, synqe->tid, lctx->ctrlq);
1009 	}
1010 	t4_l2t_release(e);
1011 	inp = release_synqe(sc, synqe);
1012 	if (inp)
1013 		INP_WUNLOCK(inp);
1014 }
1015 
1016 void
synack_failure_cleanup(struct adapter * sc,struct synq_entry * synqe)1017 synack_failure_cleanup(struct adapter *sc, struct synq_entry *synqe)
1018 {
1019 	INP_WLOCK(synqe->lctx->inp);
1020 	done_with_synqe(sc, synqe);
1021 }
1022 
1023 int
do_abort_req_synqe(struct sge_iq * iq,const struct rss_header * rss,struct mbuf * m)1024 do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss,
1025     struct mbuf *m)
1026 {
1027 	struct adapter *sc = iq->adapter;
1028 	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
1029 	unsigned int tid = GET_TID(cpl);
1030 	struct synq_entry *synqe = lookup_tid(sc, tid);
1031 	struct listen_ctx *lctx = synqe->lctx;
1032 	struct inpcb *inp = lctx->inp;
1033 	struct sge_ofld_txq *ofld_txq;
1034 #ifdef INVARIANTS
1035 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
1036 #endif
1037 
1038 	KASSERT(opcode == CPL_ABORT_REQ_RSS,
1039 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
1040 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1041 	KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));
1042 
1043 	CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
1044 	    __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);
1045 
1046 	if (negative_advice(cpl->status))
1047 		return (0);	/* Ignore negative advice */
1048 
1049 	INP_WLOCK(inp);
1050 
1051 	ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx];
1052 
1053 	if (!(synqe->flags & TPF_FLOWC_WR_SENT))
1054 		send_flowc_wr_synqe(sc, synqe);
1055 
1056 	/*
1057 	 * If we'd initiated an abort earlier the reply to it is responsible for
1058 	 * cleaning up resources.  Otherwise we tear everything down right here
1059 	 * right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
1060 	 */
1061 	if (synqe->flags & TPF_ABORT_SHUTDOWN) {
1062 		INP_WUNLOCK(inp);
1063 		goto done;
1064 	}
1065 
1066 	done_with_synqe(sc, synqe);
1067 	/* inp lock released by done_with_synqe */
1068 done:
1069 	send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
1070 	return (0);
1071 }
1072 
1073 int
do_abort_rpl_synqe(struct sge_iq * iq,const struct rss_header * rss,struct mbuf * m)1074 do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss,
1075     struct mbuf *m)
1076 {
1077 	struct adapter *sc = iq->adapter;
1078 	const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
1079 	unsigned int tid = GET_TID(cpl);
1080 	struct synq_entry *synqe = lookup_tid(sc, tid);
1081 	struct listen_ctx *lctx = synqe->lctx;
1082 	struct inpcb *inp = lctx->inp;
1083 #ifdef INVARIANTS
1084 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
1085 #endif
1086 
1087 	KASSERT(opcode == CPL_ABORT_RPL_RSS,
1088 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
1089 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1090 	KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));
1091 
1092 	CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
1093 	    __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);
1094 
1095 	INP_WLOCK(inp);
1096 	KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
1097 	    ("%s: wasn't expecting abort reply for synqe %p (0x%x)",
1098 	    __func__, synqe, synqe->flags));
1099 
1100 	done_with_synqe(sc, synqe);
1101 	/* inp lock released by done_with_synqe */
1102 
1103 	return (0);
1104 }
1105 
1106 void
t4_offload_socket(struct toedev * tod,void * arg,struct socket * so)1107 t4_offload_socket(struct toedev *tod, void *arg, struct socket *so)
1108 {
1109 	struct adapter *sc = tod->tod_softc;
1110 	struct tom_data *td = sc->tom_softc;
1111 	struct synq_entry *synqe = arg;
1112 	struct inpcb *inp = sotoinpcb(so);
1113 	struct toepcb *toep = synqe->toep;
1114 
1115 	NET_EPOCH_ASSERT();	/* prevents bad race with accept() */
1116 	INP_WLOCK_ASSERT(inp);
1117 	KASSERT(synqe->flags & TPF_SYNQE,
1118 	    ("%s: %p not a synq_entry?", __func__, arg));
1119 	MPASS(toep->tid == synqe->tid);
1120 
1121 	offload_socket(so, toep);
1122 	make_established(toep, synqe->iss, synqe->irs, synqe->tcp_opt);
1123 	toep->flags |= TPF_CPL_PENDING;
1124 	update_tid(sc, synqe->tid, toep);
1125 	synqe->flags |= TPF_SYNQE_EXPANDED;
1126 	mtx_lock(&td->toep_list_lock);
1127 	/* Remove synqe from its list and add the TOE PCB to the active list. */
1128 	TAILQ_REMOVE(&td->synqe_list, synqe, link);
1129 	TAILQ_INSERT_TAIL(&td->toep_list, toep, link);
1130 	toep->flags |= TPF_IN_TOEP_LIST;
1131 	mtx_unlock(&td->toep_list_lock);
1132 	inp->inp_flowtype = (inp->inp_vflag & INP_IPV6) ?
1133 	    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4;
1134 	inp->inp_flowid = synqe->rss_hash;
1135 }
1136 
1137 static void
t4opt_to_tcpopt(const struct tcp_options * t4opt,struct tcpopt * to)1138 t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to)
1139 {
1140 	bzero(to, sizeof(*to));
1141 
1142 	if (t4opt->mss) {
1143 		to->to_flags |= TOF_MSS;
1144 		to->to_mss = be16toh(t4opt->mss);
1145 	}
1146 
1147 	if (t4opt->wsf > 0 && t4opt->wsf < 15) {
1148 		to->to_flags |= TOF_SCALE;
1149 		to->to_wscale = t4opt->wsf;
1150 	}
1151 
1152 	if (t4opt->tstamp)
1153 		to->to_flags |= TOF_TS;
1154 
1155 	if (t4opt->sack)
1156 		to->to_flags |= TOF_SACKPERM;
1157 }
1158 
1159 static bool
encapsulated_syn(struct adapter * sc,const struct cpl_pass_accept_req * cpl)1160 encapsulated_syn(struct adapter *sc, const struct cpl_pass_accept_req *cpl)
1161 {
1162 	u_int hlen = be32toh(cpl->hdr_len);
1163 
1164 	if (chip_id(sc) >= CHELSIO_T6)
1165 		return (G_T6_ETH_HDR_LEN(hlen) > sizeof(struct ether_vlan_header));
1166 	else
1167 		return (G_ETH_HDR_LEN(hlen) > sizeof(struct ether_vlan_header));
1168 }
1169 
1170 static void
pass_accept_req_to_protohdrs(struct adapter * sc,const struct mbuf * m,struct in_conninfo * inc,struct tcphdr * th,uint8_t * iptos)1171 pass_accept_req_to_protohdrs(struct adapter *sc, const struct mbuf *m,
1172     struct in_conninfo *inc, struct tcphdr *th, uint8_t *iptos)
1173 {
1174 	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
1175 	const struct ether_header *eh;
1176 	unsigned int hlen = be32toh(cpl->hdr_len);
1177 	uintptr_t l3hdr;
1178 	const struct tcphdr *tcp;
1179 
1180 	eh = (const void *)(cpl + 1);
1181 	if (chip_id(sc) >= CHELSIO_T6) {
1182 		l3hdr = ((uintptr_t)eh + G_T6_ETH_HDR_LEN(hlen));
1183 		tcp = (const void *)(l3hdr + G_T6_IP_HDR_LEN(hlen));
1184 	} else {
1185 		l3hdr = ((uintptr_t)eh + G_ETH_HDR_LEN(hlen));
1186 		tcp = (const void *)(l3hdr + G_IP_HDR_LEN(hlen));
1187 	}
1188 
1189 	/* extract TOS (DiffServ + ECN) byte for AccECN */
1190 	if (iptos) {
1191 		if (((struct ip *)l3hdr)->ip_v == IPVERSION) {
1192 			const struct ip *ip = (const void *)l3hdr;
1193 			*iptos = ip->ip_tos;
1194 		}
1195 #ifdef INET6
1196 		else
1197 		if (((struct ip *)l3hdr)->ip_v == (IPV6_VERSION >> 4)) {
1198 			const struct ip6_hdr *ip6 = (const void *)l3hdr;
1199 			*iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
1200 		}
1201 #endif /* INET */
1202 	}
1203 
1204 	if (inc) {
1205 		bzero(inc, sizeof(*inc));
1206 		inc->inc_fport = tcp->th_sport;
1207 		inc->inc_lport = tcp->th_dport;
1208 		if (((struct ip *)l3hdr)->ip_v == IPVERSION) {
1209 			const struct ip *ip = (const void *)l3hdr;
1210 
1211 			inc->inc_faddr = ip->ip_src;
1212 			inc->inc_laddr = ip->ip_dst;
1213 		} else {
1214 			const struct ip6_hdr *ip6 = (const void *)l3hdr;
1215 
1216 			inc->inc_flags |= INC_ISIPV6;
1217 			inc->inc6_faddr = ip6->ip6_src;
1218 			inc->inc6_laddr = ip6->ip6_dst;
1219 		}
1220 	}
1221 
1222 	if (th) {
1223 		bcopy(tcp, th, sizeof(*th));
1224 		tcp_fields_to_host(th);		/* just like tcp_input */
1225 	}
1226 }
1227 
1228 static struct l2t_entry *
get_l2te_for_nexthop(struct port_info * pi,if_t ifp,struct in_conninfo * inc)1229 get_l2te_for_nexthop(struct port_info *pi, if_t ifp,
1230     struct in_conninfo *inc)
1231 {
1232 	struct l2t_entry *e;
1233 	struct sockaddr_in6 sin6;
1234 	struct sockaddr *dst = (void *)&sin6;
1235 	struct nhop_object *nh;
1236 
1237 	if (inc->inc_flags & INC_ISIPV6) {
1238 		bzero(dst, sizeof(struct sockaddr_in6));
1239 		dst->sa_len = sizeof(struct sockaddr_in6);
1240 		dst->sa_family = AF_INET6;
1241 
1242 		if (IN6_IS_ADDR_LINKLOCAL(&inc->inc6_laddr)) {
1243 			/* no need for route lookup */
1244 			e = t4_l2t_get(pi, ifp, dst);
1245 			return (e);
1246 		}
1247 
1248 		nh = fib6_lookup(RT_DEFAULT_FIB, &inc->inc6_faddr, 0, NHR_NONE, 0);
1249 		if (nh == NULL)
1250 			return (NULL);
1251 		if (nh->nh_ifp != ifp)
1252 			return (NULL);
1253 		if (nh->nh_flags & NHF_GATEWAY)
1254 			((struct sockaddr_in6 *)dst)->sin6_addr = nh->gw6_sa.sin6_addr;
1255 		else
1256 			((struct sockaddr_in6 *)dst)->sin6_addr = inc->inc6_faddr;
1257 	} else {
1258 		dst->sa_len = sizeof(struct sockaddr_in);
1259 		dst->sa_family = AF_INET;
1260 
1261 		nh = fib4_lookup(RT_DEFAULT_FIB, inc->inc_faddr, 0, NHR_NONE, 0);
1262 		if (nh == NULL)
1263 			return (NULL);
1264 		if (nh->nh_ifp != ifp)
1265 			return (NULL);
1266 		if (nh->nh_flags & NHF_GATEWAY)
1267 			if (nh->gw_sa.sa_family == AF_INET)
1268 				((struct sockaddr_in *)dst)->sin_addr = nh->gw4_sa.sin_addr;
1269 			else
1270 				*((struct sockaddr_in6 *)dst) = nh->gw6_sa;
1271 		else
1272 			((struct sockaddr_in *)dst)->sin_addr = inc->inc_faddr;
1273 	}
1274 
1275 	e = t4_l2t_get(pi, ifp, dst);
1276 	return (e);
1277 }
1278 
1279 static int
send_synack(struct adapter * sc,struct synq_entry * synqe,uint64_t opt0,uint32_t opt2,int tid)1280 send_synack(struct adapter *sc, struct synq_entry *synqe, uint64_t opt0,
1281     uint32_t opt2, int tid)
1282 {
1283 	struct wrqe *wr;
1284 	struct cpl_pass_accept_rpl *rpl;
1285 	struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx];
1286 
1287 	wr = alloc_wrqe(is_t4(sc) ? sizeof(struct cpl_pass_accept_rpl) :
1288 	    sizeof(struct cpl_t5_pass_accept_rpl), &sc->sge.ctrlq[0]);
1289 	if (wr == NULL)
1290 		return (ENOMEM);
1291 	rpl = wrtod(wr);
1292 
1293 	if (is_t4(sc))
1294 		INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid);
1295 	else {
1296 		struct cpl_t5_pass_accept_rpl *rpl5 = (void *)rpl;
1297 
1298 		INIT_TP_WR_MIT_CPL(rpl5, CPL_PASS_ACCEPT_RPL, tid);
1299 		rpl5->iss = htobe32(synqe->iss);
1300 	}
1301 	rpl->opt0 = opt0;
1302 	rpl->opt2 = opt2;
1303 
1304 	return (t4_l2t_send(sc, wr, e));
1305 }
1306 
/*
 * Abandon offload of an incoming SYN from within do_pass_accept_req.
 *
 * Relies on the enclosing function providing an mbuf pointer named m, an
 * int named reject_reason, and a label named reject.  If tunnel is false
 * the mbuf is freed and m is set to NULL; if it is true m is left alone so
 * the code at the reject label can still use it.  reject_reason records the
 * source line of the macro invocation.
 *
 * The argument is parenthesized so that a compound expression is negated as
 * a whole.
 */
#define REJECT_PASS_ACCEPT_REQ(tunnel)	do { \
	if (!(tunnel)) { \
		m_freem(m); \
		m = NULL; \
	} \
	reject_reason = __LINE__; \
	goto reject; \
} while (0)
1315 
1316 /*
1317  * The context associated with a tid entry via insert_tid could be a synq_entry
1318  * or a toepcb.  The only way CPL handlers can tell is via a bit in these flags.
1319  */
1320 CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags));
1321 
1322 /*
1323  * Incoming SYN on a listening socket.
1324  *
1325  * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe,
1326  * etc.
1327  */
/*
 * Handler for CPL_PASS_ACCEPT_REQ.  m holds the CPL followed by the SYN.
 *
 * The SYN goes through a series of checks.  Failing any of them jumps to the
 * reject label via REJECT_PASS_ACCEPT_REQ, which records the source line of
 * the failing check in reject_reason.  REJECT_PASS_ACCEPT_REQ(true) keeps m
 * so that the reject code can pass the SYN to if_input(); with (false) the
 * mbuf is freed and nothing is passed up.
 *
 * If every check passes, a synq entry is allocated and handed to
 * toe_syncache_add().  If ok_to_respond is positive afterwards, the tid is
 * inserted, the synq entry takes ownership of m, and a SYN|ACK is sent.
 *
 * Returns 0 when the SYN|ACK was sent, otherwise the non-zero reject_reason.
 */
1328 static int
do_pass_accept_req(struct sge_iq * iq,const struct rss_header * rss,struct mbuf * m)1329 do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss,
1330     struct mbuf *m)
1331 {
1332 	struct adapter *sc = iq->adapter;
1333 	struct tom_data *td = sc->tom_softc;
1334 	struct toedev *tod;
1335 	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
1336 	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
1337 	unsigned int tid = GET_TID(cpl);
1338 	struct listen_ctx *lctx = lookup_stid(sc, stid);
1339 	struct inpcb *inp;
1340 	struct tcpcb *tp;
1341 	struct socket *so;
1342 	struct in_conninfo inc;
1343 	struct tcphdr th;
1344 	struct tcpopt to;
1345 	struct port_info *pi;
1346 	struct vi_info *vi;
1347 	if_t hw_ifp, ifp;
	/* e and synqe start out NULL so the reject code knows what to release. */
1348 	struct l2t_entry *e = NULL;
1349 	struct synq_entry *synqe = NULL;
1350 	int reject_reason, v, ntids;
1351 	uint16_t vid, l2info;
1352 	struct epoch_tracker et;
1353 #ifdef INVARIANTS
1354 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
1355 #endif
1356 	struct offload_settings settings;
	/* Filled in by pass_accept_req_to_protohdrs further down. */
1357 	uint8_t iptos;
1358 
1359 	KASSERT(opcode == CPL_PASS_ACCEPT_REQ,
1360 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
1361 	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
1362 
1363 	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid,
1364 	    lctx);
1365 
1366 	/*
1367 	 * Figure out the port the SYN arrived on.  We'll look for an exact VI
1368 	 * match in a bit but in case we don't find any we'll use the main VI as
1369 	 * the incoming ifnet.
1370 	 */
1371 	l2info = be16toh(cpl->l2info);
1372 	pi = sc->port[G_SYN_INTF(l2info)];
1373 	hw_ifp = pi->vi[0].ifp;
1374 	m->m_pkthdr.rcvif = hw_ifp;
1375 
1376 	CURVNET_SET(lctx->vnet);	/* before any potential REJECT */
1377 
1378 	/*
1379 	 * If VXLAN/NVGRE parsing is enabled then SYNs in the inner traffic will
1380 	 * also hit the listener.  We don't want to offload those.
1381 	 */
1382 	if (encapsulated_syn(sc, cpl)) {
1383 		REJECT_PASS_ACCEPT_REQ(true);
1384 	}
1385 
1386 	/*
1387 	 * Use the MAC index to lookup the associated VI.  If this SYN didn't
1388 	 * match a perfect MAC filter, punt.
1389 	 */
1390 	if (!(l2info & F_SYN_XACT_MATCH)) {
1391 		REJECT_PASS_ACCEPT_REQ(true);
1392 	}
1393 	for_each_vi(pi, v, vi) {
1394 		if (vi->xact_addr_filt == G_SYN_MAC_IDX(l2info))
1395 			goto found;
1396 	}
	/* No VI on this port owns the matching MAC filter. */
1397 	REJECT_PASS_ACCEPT_REQ(true);
1398 found:
1399 	hw_ifp = vi->ifp;	/* the cxgbe ifnet */
1400 	m->m_pkthdr.rcvif = hw_ifp;
1401 	tod = TOEDEV(hw_ifp);
1402 
1403 	/*
1404 	 * Don't offload if the peer requested a TCP option that's not known to
1405 	 * the silicon.  Send the SYN to the kernel instead.
1406 	 */
1407 	if (__predict_false(cpl->tcpopt.unknown))
1408 		REJECT_PASS_ACCEPT_REQ(true);
1409 
1410 	/*
1411 	 * Figure out if there is a pseudo interface (vlan, lagg, etc.)
1412 	 * involved.  Don't offload if the SYN had a VLAN tag and the vid
1413 	 * doesn't match anything on this interface.
1414 	 *
1415 	 * XXX: lagg support, lagg + vlan support.
1416 	 */
1417 	vid = EVL_VLANOFTAG(be16toh(cpl->vlan));
1418 	if (vid != 0xfff && vid != 0) {
1419 		ifp = VLAN_DEVAT(hw_ifp, vid);
1420 		if (ifp == NULL)
1421 			REJECT_PASS_ACCEPT_REQ(true);
1422 	} else
1423 		ifp = hw_ifp;
1424 
1425 	/*
1426 	 * Don't offload if the ifnet that the SYN came in on is not in the same
1427 	 * vnet as the listening socket.
1428 	 */
1429 	if (lctx->vnet != if_getvnet(ifp))
1430 		REJECT_PASS_ACCEPT_REQ(true);
1431 
1432 	pass_accept_req_to_protohdrs(sc, m, &inc, &th, &iptos);
	/*
	 * Both branches below enter the net epoch.  Every reject from here on
	 * exits the epoch before jumping to the reject label.
	 */
1433 	if (inc.inc_flags & INC_ISIPV6) {
1434 
1435 		/* Don't offload if the ifcap isn't enabled */
1436 		if ((if_getcapenable(ifp) & IFCAP_TOE6) == 0)
1437 			REJECT_PASS_ACCEPT_REQ(true);
1438 
1439 		/*
1440 		 * SYN must be directed to an IP6 address on this ifnet.  This
1441 		 * is more restrictive than in6_localip.
1442 		 */
1443 		NET_EPOCH_ENTER(et);
1444 		if (!in6_ifhasaddr(ifp, &inc.inc6_laddr)) {
1445 			NET_EPOCH_EXIT(et);
1446 			REJECT_PASS_ACCEPT_REQ(true);
1447 		}
1448 
1449 		ntids = 2;
1450 	} else {
1451 
1452 		/* Don't offload if the ifcap isn't enabled */
1453 		if ((if_getcapenable(ifp) & IFCAP_TOE4) == 0)
1454 			REJECT_PASS_ACCEPT_REQ(true);
1455 
1456 		/*
1457 		 * SYN must be directed to an IP address on this ifnet.  This
1458 		 * is more restrictive than in_localip.
1459 		 */
1460 		NET_EPOCH_ENTER(et);
1461 		if (!in_ifhasaddr(ifp, inc.inc_laddr)) {
1462 			NET_EPOCH_EXIT(et);
1463 			REJECT_PASS_ACCEPT_REQ(true);
1464 		}
1465 
1466 		ntids = 1;
1467 	}
1468 
	/* From here on e is non-NULL and is released by the reject code. */
1469 	e = get_l2te_for_nexthop(pi, ifp, &inc);
1470 	if (e == NULL) {
1471 		NET_EPOCH_EXIT(et);
1472 		REJECT_PASS_ACCEPT_REQ(true);
1473 	}
1474 
1475 	/* Don't offload if the 4-tuple is already in use */
1476 	if (toe_4tuple_check(&inc, &th, ifp) != 0) {
1477 		NET_EPOCH_EXIT(et);
1478 		REJECT_PASS_ACCEPT_REQ(false);
1479 	}
1480 
1481 	inp = lctx->inp;		/* listening socket, not owned by TOE */
1482 	tp = intotcpcb(inp);
1483 	INP_RLOCK(inp);
1484 
1485 	/* Don't offload if the listening socket has closed */
1486 	if (__predict_false(tp->t_flags & TF_DISCONNECTED)) {
1487 		INP_RUNLOCK(inp);
1488 		NET_EPOCH_EXIT(et);
1489 		REJECT_PASS_ACCEPT_REQ(false);
1490 	}
1491 	so = inp->inp_socket;
	/* Copy the policy result out so the policy lock can be dropped. */
1492 	rw_rlock(&sc->policy_lock);
1493 	settings = *lookup_offload_policy(sc, OPEN_TYPE_PASSIVE, m,
1494 	    EVL_MAKETAG(0xfff, 0, 0), inp);
1495 	rw_runlock(&sc->policy_lock);
1496 	if (!settings.offload) {
1497 		INP_RUNLOCK(inp);
1498 		NET_EPOCH_EXIT(et);
1499 		REJECT_PASS_ACCEPT_REQ(true);	/* Rejected by COP. */
1500 	}
1501 
1502 	synqe = alloc_synqe(sc, lctx, M_NOWAIT);
1503 	if (synqe == NULL) {
1504 		INP_RUNLOCK(inp);
1505 		NET_EPOCH_EXIT(et);
1506 		REJECT_PASS_ACCEPT_REQ(true);
1507 	}
1508 	MPASS(rss->hash_type == RSS_HASH_TCP);
1509 	synqe->rss_hash = be32toh(rss->hash_val);
	/* Cleared here; checked again after toe_syncache_add below. */
1510 	atomic_store_int(&synqe->ok_to_respond, 0);
1511 
1512 	init_conn_params(vi, &settings, &inc, so, &cpl->tcpopt, e->idx,
1513 	    &synqe->params);
1514 	if (sc->params.tid_qid_sel_mask != 0)
1515 		update_tid_qid_sel(vi, &synqe->params, tid);
1516 
1517 	/*
1518 	 * If all goes well t4_syncache_respond will get called during
1519 	 * syncache_add.  Note that syncache_add releases the pcb lock.
1520 	 */
1521 	t4opt_to_tcpopt(&cpl->tcpopt, &to);
1522 	toe_syncache_add(&inc, &to, &th, inp, tod, synqe, iptos);
1523 
1524 	if (atomic_load_int(&synqe->ok_to_respond) > 0) {
1525 		uint64_t opt0;
1526 		uint32_t opt2;
1527 
1528 		opt0 = calc_options0(vi, &synqe->params);
1529 		opt2 = calc_options2(vi, &synqe->params);
1530 
		/*
		 * The synqe takes over the SYN mbuf.  m is cleared so that
		 * nothing below this point frees it or passes it up.
		 */
1531 		insert_tid(sc, tid, synqe, ntids);
1532 		synqe->tid = tid;
1533 		synqe->syn = m;
1534 		m = NULL;
1535 		mtx_lock(&td->toep_list_lock);
1536 		TAILQ_INSERT_TAIL(&td->synqe_list, synqe, link);
1537 		mtx_unlock(&td->toep_list_lock);
1538 
1539 		if (send_synack(sc, synqe, opt0, opt2, tid) != 0) {
			/*
			 * Undo the steps above and take the mbuf back so the
			 * SYN can still be handed to the kernel.
			 */
1540 			remove_tid(sc, tid, ntids);
1541 			m = synqe->syn;
1542 			synqe->syn = NULL;
1543 			mtx_lock(&td->toep_list_lock);
1544 			TAILQ_REMOVE(&td->synqe_list, synqe, link);
1545 			mtx_unlock(&td->toep_list_lock);
1546 			NET_EPOCH_EXIT(et);
1547 			REJECT_PASS_ACCEPT_REQ(true);
1548 		}
1549 		CTR6(KTR_CXGBE,
1550 		    "%s: stid %u, tid %u, synqe %p, opt0 %#016lx, opt2 %#08x",
1551 		    __func__, stid, tid, synqe, be64toh(opt0), be32toh(opt2));
1552 	} else {
1553 		NET_EPOCH_EXIT(et);
1554 		REJECT_PASS_ACCEPT_REQ(false);
1555 	}
1556 
1557 	NET_EPOCH_EXIT(et);
1558 	CURVNET_RESTORE();
1559 	return (0);
1560 reject:
	/*
	 * Common exit for every rejected SYN.  The net epoch, if it was
	 * entered, has already been exited by the code that jumped here.
	 */
1561 	CURVNET_RESTORE();
1562 	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid,
1563 	    reject_reason);
1564 
1565 	if (e)
1566 		t4_l2t_release(e);
1567 	release_tid(sc, tid, lctx->ctrlq);
1568 	if (synqe) {
		/* release_synqe returns the inp only if it needs unlocking. */
1569 		inp = synqe->lctx->inp;
1570 		INP_WLOCK(inp);
1571 		inp = release_synqe(sc, synqe);
1572 		if (inp)
1573 			INP_WUNLOCK(inp);
1574 	}
1575 
1576 	if (m) {
1577 		/*
1578 		 * The connection request hit a TOE listener but is being passed
1579 		 * on to the kernel sw stack instead of getting offloaded.
1580 		 */
		/* Strip the CPL so that the mbuf starts at the SYN itself. */
1581 		m_adj(m, sizeof(*cpl));
1582 		m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID |
1583 		    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1584 		m->m_pkthdr.csum_data = 0xffff;
1585 		if_input(hw_ifp, m);
1586 	}
1587 
1588 	return (reject_reason);
1589 }
1590 
1591 static void
synqe_to_protohdrs(struct adapter * sc,struct synq_entry * synqe,const struct cpl_pass_establish * cpl,struct in_conninfo * inc,struct tcphdr * th,struct tcpopt * to)1592 synqe_to_protohdrs(struct adapter *sc, struct synq_entry *synqe,
1593     const struct cpl_pass_establish *cpl, struct in_conninfo *inc,
1594     struct tcphdr *th, struct tcpopt *to)
1595 {
1596 	uint16_t tcp_opt = be16toh(cpl->tcp_opt);
1597 	uint8_t iptos;
1598 
1599 	/* start off with the original SYN */
1600 	pass_accept_req_to_protohdrs(sc, synqe->syn, inc, th, &iptos);
1601 
1602 	/* modify parts to make it look like the ACK to our SYN|ACK */
1603 	tcp_set_flags(th, TH_ACK);
1604 	th->th_ack = synqe->iss + 1;
1605 	th->th_seq = be32toh(cpl->rcv_isn);
1606 	bzero(to, sizeof(*to));
1607 	if (G_TCPOPT_TSTAMP(tcp_opt)) {
1608 		to->to_flags |= TOF_TS;
1609 		to->to_tsecr = synqe->ts;
1610 	}
1611 }
1612 
1613 static int
do_pass_establish(struct sge_iq * iq,const struct rss_header * rss,struct mbuf * m)1614 do_pass_establish(struct sge_iq *iq, const struct rss_header *rss,
1615     struct mbuf *m)
1616 {
1617 	struct adapter *sc = iq->adapter;
1618 	struct vi_info *vi;
1619 	if_t ifp;
1620 	const struct cpl_pass_establish *cpl = (const void *)(rss + 1);
1621 #if defined(KTR) || defined(INVARIANTS)
1622 	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
1623 #endif
1624 	unsigned int tid = GET_TID(cpl);
1625 	struct synq_entry *synqe = lookup_tid(sc, tid);
1626 	struct listen_ctx *lctx = synqe->lctx;
1627 	struct inpcb *inp = lctx->inp, *new_inp;
1628 	struct tcpcb *tp = intotcpcb(inp);
1629 	struct socket *so;
1630 	struct tcphdr th;
1631 	struct tcpopt to;
1632 	struct in_conninfo inc;
1633 	struct toepcb *toep;
1634 	struct epoch_tracker et;
1635 	int rstreason;
1636 #ifdef INVARIANTS
1637 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
1638 #endif
1639 
1640 	KASSERT(opcode == CPL_PASS_ESTABLISH,
1641 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
1642 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1643 	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
1644 	KASSERT(synqe->flags & TPF_SYNQE,
1645 	    ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe));
1646 
1647 	CURVNET_SET(lctx->vnet);
1648 	NET_EPOCH_ENTER(et);	/* for syncache_expand */
1649 	INP_WLOCK(inp);
1650 
1651 	CTR6(KTR_CXGBE,
1652 	    "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x",
1653 	    __func__, stid, tid, synqe, synqe->flags, inp->inp_flags);
1654 
1655 	ifp = synqe->syn->m_pkthdr.rcvif;
1656 	vi = if_getsoftc(ifp);
1657 	KASSERT(vi->adapter == sc,
1658 	    ("%s: vi %p, sc %p mismatch", __func__, vi, sc));
1659 
1660 	if (__predict_false(tp->t_flags & TF_DISCONNECTED)) {
1661 reset:
1662 		send_abort_rpl_synqe(TOEDEV(ifp), synqe, CPL_ABORT_SEND_RST);
1663 		INP_WUNLOCK(inp);
1664 		NET_EPOCH_EXIT(et);
1665 		CURVNET_RESTORE();
1666 		return (0);
1667 	}
1668 
1669 	KASSERT(synqe->params.rxq_idx == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0],
1670 	    ("%s: CPL arrived on unexpected rxq.  %d %d", __func__,
1671 	    synqe->params.rxq_idx,
1672 	    (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0])));
1673 
1674 	toep = alloc_toepcb(vi, M_NOWAIT);
1675 	if (toep == NULL)
1676 		goto reset;
1677 	toep->tid = tid;
1678 	toep->l2te = &sc->l2t->l2tab[synqe->params.l2t_idx];
1679 	toep->vnet = lctx->vnet;
1680 	bcopy(&synqe->params, &toep->params, sizeof(toep->params));
1681 	init_toepcb(vi, toep);
1682 
1683 	MPASS(be32toh(cpl->snd_isn) - 1 == synqe->iss);
1684 	MPASS(be32toh(cpl->rcv_isn) - 1 == synqe->irs);
1685 	synqe->tcp_opt = cpl->tcp_opt;
1686 	synqe->toep = toep;
1687 
1688 	/* Come up with something that syncache_expand should be ok with. */
1689 	synqe_to_protohdrs(sc, synqe, cpl, &inc, &th, &to);
1690 	if (inc.inc_flags & INC_ISIPV6) {
1691 		if (lctx->ce == NULL) {
1692 			toep->ce = t4_get_clip_entry(sc, &inc.inc6_laddr, true);
1693 			if (toep->ce == NULL) {
1694 				free_toepcb(toep);
1695 				goto reset;	/* RST without a CLIP entry? */
1696 			}
1697 		} else {
1698 			t4_hold_clip_entry(sc, lctx->ce);
1699 			toep->ce = lctx->ce;
1700 		}
1701 	}
1702 	so = inp->inp_socket;
1703 	KASSERT(so != NULL, ("%s: socket is NULL", __func__));
1704 
1705 	rstreason = toe_syncache_expand(&inc, &to, &th, &so);
1706 	if (rstreason < 0) {
1707 		free_toepcb(toep);
1708 		send_abort_rpl_synqe(TOEDEV(ifp), synqe, CPL_ABORT_NO_RST);
1709 		INP_WUNLOCK(inp);
1710 		NET_EPOCH_EXIT(et);
1711 		CURVNET_RESTORE();
1712 		return (0);
1713 	} else if (rstreason == 0 || so == NULL) {
1714 		free_toepcb(toep);
1715 		goto reset;
1716 	}
1717 
1718 	/* New connection inpcb is already locked by syncache_expand(). */
1719 	new_inp = sotoinpcb(so);
1720 	INP_WLOCK_ASSERT(new_inp);
1721 	MPASS(so->so_vnet == lctx->vnet);
1722 
1723 	/*
1724 	 * This is for expansion from syncookies.
1725 	 *
1726 	 * XXX: we've held the tcbinfo lock throughout so there's no risk of
1727 	 * anyone accept'ing a connection before we've installed our hooks, but
1728 	 * this somewhat defeats the purpose of having a tod_offload_socket :-(
1729 	 */
1730 	if (__predict_false(!(synqe->flags & TPF_SYNQE_EXPANDED))) {
1731 		tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0);
1732 		t4_offload_socket(TOEDEV(ifp), synqe, so);
1733 	}
1734 
1735 	INP_WUNLOCK(new_inp);
1736 
1737 	/* Done with the synqe */
1738 	inp = release_synqe(sc, synqe);
1739 	if (inp != NULL)
1740 		INP_WUNLOCK(inp);
1741 	NET_EPOCH_EXIT(et);
1742 	CURVNET_RESTORE();
1743 
1744 	return (0);
1745 }
1746 
1747 void
t4_init_listen_cpl_handlers(void)1748 t4_init_listen_cpl_handlers(void)
1749 {
1750 
1751 	t4_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl);
1752 	t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
1753 	t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
1754 	t4_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
1755 }
1756 
1757 void
t4_uninit_listen_cpl_handlers(void)1758 t4_uninit_listen_cpl_handlers(void)
1759 {
1760 
1761 	t4_register_cpl_handler(CPL_PASS_OPEN_RPL, NULL);
1762 	t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, NULL);
1763 	t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, NULL);
1764 	t4_register_cpl_handler(CPL_PASS_ESTABLISH, NULL);
1765 }
1766 #endif
1767