127f190a3SBjoern A. Zeeb /*- 2fe267a55SPedro F. Giffuni * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3fe267a55SPedro F. Giffuni * 427f190a3SBjoern A. Zeeb * Copyright (c) 2007, Myricom Inc. 527f190a3SBjoern A. Zeeb * Copyright (c) 2008, Intel Corporation. 662b5b6ecSBjoern A. Zeeb * Copyright (c) 2012 The FreeBSD Foundation 7e936121dSHans Petter Selasky * Copyright (c) 2016 Mellanox Technologies. 827f190a3SBjoern A. Zeeb * All rights reserved. 927f190a3SBjoern A. Zeeb * 1062b5b6ecSBjoern A. Zeeb * Portions of this software were developed by Bjoern Zeeb 1162b5b6ecSBjoern A. Zeeb * under sponsorship from the FreeBSD Foundation. 1262b5b6ecSBjoern A. Zeeb * 1327f190a3SBjoern A. Zeeb * Redistribution and use in source and binary forms, with or without 1427f190a3SBjoern A. Zeeb * modification, are permitted provided that the following conditions 1527f190a3SBjoern A. Zeeb * are met: 1627f190a3SBjoern A. Zeeb * 1. Redistributions of source code must retain the above copyright 1727f190a3SBjoern A. Zeeb * notice, this list of conditions and the following disclaimer. 1827f190a3SBjoern A. Zeeb * 2. Redistributions in binary form must reproduce the above copyright 1927f190a3SBjoern A. Zeeb * notice, this list of conditions and the following disclaimer in the 2027f190a3SBjoern A. Zeeb * documentation and/or other materials provided with the distribution. 2127f190a3SBjoern A. Zeeb * 2227f190a3SBjoern A. Zeeb * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 2327f190a3SBjoern A. Zeeb * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 2427f190a3SBjoern A. Zeeb * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 2527f190a3SBjoern A. Zeeb * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 2627f190a3SBjoern A. Zeeb * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 2727f190a3SBjoern A. Zeeb * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 2827f190a3SBjoern A. Zeeb * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 2927f190a3SBjoern A. Zeeb * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 3027f190a3SBjoern A. Zeeb * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 3127f190a3SBjoern A. Zeeb * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 3227f190a3SBjoern A. Zeeb * SUCH DAMAGE. 3327f190a3SBjoern A. Zeeb */ 346c5087a8SJack F Vogel 3562b5b6ecSBjoern A. Zeeb #include <sys/cdefs.h> 3662b5b6ecSBjoern A. Zeeb __FBSDID("$FreeBSD$"); 3762b5b6ecSBjoern A. Zeeb 3862b5b6ecSBjoern A. Zeeb #include "opt_inet.h" 3962b5b6ecSBjoern A. Zeeb #include "opt_inet6.h" 4062b5b6ecSBjoern A. Zeeb 416c5087a8SJack F Vogel #include <sys/param.h> 426c5087a8SJack F Vogel #include <sys/systm.h> 436c5087a8SJack F Vogel #include <sys/kernel.h> 448ec07310SGleb Smirnoff #include <sys/malloc.h> 458ec07310SGleb Smirnoff #include <sys/mbuf.h> 466c5087a8SJack F Vogel #include <sys/socket.h> 47e57b2d0eSRandall Stewart #include <sys/socketvar.h> 48e57b2d0eSRandall Stewart #include <sys/sockbuf.h> 498452c1b3SSepherosa Ziehau #include <sys/sysctl.h> 506c5087a8SJack F Vogel 516c5087a8SJack F Vogel #include <net/if.h> 5262b5b6ecSBjoern A. Zeeb #include <net/if_var.h> 536c5087a8SJack F Vogel #include <net/ethernet.h> 54*69a34e8dSRandall Stewart #include <net/bpf.h> 555fa2656eSBjoern A. Zeeb #include <net/vnet.h> 566c5087a8SJack F Vogel 576c5087a8SJack F Vogel #include <netinet/in_systm.h> 586c5087a8SJack F Vogel #include <netinet/in.h> 5962b5b6ecSBjoern A. Zeeb #include <netinet/ip6.h> 606c5087a8SJack F Vogel #include <netinet/ip.h> 6131bfc56eSBjoern A. Zeeb #include <netinet/ip_var.h> 62e57b2d0eSRandall Stewart #include <netinet/in_pcb.h> 63e57b2d0eSRandall Stewart #include <netinet6/in6_pcb.h> 646c5087a8SJack F Vogel #include <netinet/tcp.h> 65d7fb35d1SSean Bruno #include <netinet/tcp_seq.h> 666c5087a8SJack F Vogel #include <netinet/tcp_lro.h> 678452c1b3SSepherosa Ziehau #include <netinet/tcp_var.h> 68*69a34e8dSRandall Stewart #include <netinet/tcpip.h> 69e57b2d0eSRandall Stewart #include <netinet/tcp_hpts.h> 70e57b2d0eSRandall Stewart #include <netinet/tcp_log_buf.h> 7131bfc56eSBjoern A. Zeeb #include <netinet6/ip6_var.h> 7231bfc56eSBjoern A. Zeeb 736c5087a8SJack F Vogel #include <machine/in_cksum.h> 746c5087a8SJack F Vogel 75e936121dSHans Petter Selasky static MALLOC_DEFINE(M_LRO, "LRO", "LRO control structures"); 766c5087a8SJack F Vogel 7762b5b6ecSBjoern A. Zeeb #define TCP_LRO_UPDATE_CSUM 1 7862b5b6ecSBjoern A. Zeeb #ifndef TCP_LRO_UPDATE_CSUM 7962b5b6ecSBjoern A. Zeeb #define TCP_LRO_INVALID_CSUM 0x0000 8062b5b6ecSBjoern A. Zeeb #endif 816c5087a8SJack F Vogel 826dd38b87SSepherosa Ziehau static void tcp_lro_rx_done(struct lro_ctrl *lc); 8305cde7efSSepherosa Ziehau static int tcp_lro_rx2(struct lro_ctrl *lc, struct mbuf *m, 8405cde7efSSepherosa Ziehau uint32_t csum, int use_hash); 856dd38b87SSepherosa Ziehau 868452c1b3SSepherosa Ziehau SYSCTL_NODE(_net_inet_tcp, OID_AUTO, lro, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 878452c1b3SSepherosa Ziehau "TCP LRO"); 888452c1b3SSepherosa Ziehau 89e57b2d0eSRandall Stewart static long tcplro_stacks_wanting_mbufq = 0; 90e57b2d0eSRandall Stewart counter_u64_t tcp_inp_lro_direct_queue; 91e57b2d0eSRandall Stewart counter_u64_t tcp_inp_lro_wokeup_queue; 92e57b2d0eSRandall Stewart counter_u64_t tcp_inp_lro_compressed; 93e57b2d0eSRandall Stewart counter_u64_t tcp_inp_lro_single_push; 94e57b2d0eSRandall Stewart counter_u64_t tcp_inp_lro_locks_taken; 95e57b2d0eSRandall Stewart counter_u64_t tcp_inp_lro_sack_wake; 96*69a34e8dSRandall Stewart counter_u64_t tcp_extra_mbuf; 97*69a34e8dSRandall Stewart counter_u64_t tcp_would_have_but; 98*69a34e8dSRandall Stewart counter_u64_t tcp_comp_total; 99*69a34e8dSRandall Stewart counter_u64_t tcp_uncomp_total; 100*69a34e8dSRandall Stewart counter_u64_t tcp_csum_hardware; 101*69a34e8dSRandall Stewart counter_u64_t tcp_csum_hardware_w_ph; 102*69a34e8dSRandall Stewart counter_u64_t tcp_csum_software; 103*69a34e8dSRandall Stewart 104e57b2d0eSRandall Stewart 1058452c1b3SSepherosa Ziehau static unsigned tcp_lro_entries = TCP_LRO_ENTRIES; 1068452c1b3SSepherosa Ziehau SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, entries, 1078452c1b3SSepherosa Ziehau CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_entries, 0, 1088452c1b3SSepherosa Ziehau "default number of LRO entries"); 109*69a34e8dSRandall Stewart 110e57b2d0eSRandall Stewart SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, fullqueue, CTLFLAG_RD, 111e57b2d0eSRandall Stewart &tcp_inp_lro_direct_queue, "Number of lro's fully queued to transport"); 112e57b2d0eSRandall Stewart SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, wokeup, CTLFLAG_RD, 113e57b2d0eSRandall Stewart &tcp_inp_lro_wokeup_queue, "Number of lro's where we woke up transport via hpts"); 114e57b2d0eSRandall Stewart SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, compressed, CTLFLAG_RD, 115e57b2d0eSRandall Stewart &tcp_inp_lro_compressed, "Number of lro's compressed and sent to transport"); 116e57b2d0eSRandall Stewart SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, single, CTLFLAG_RD, 117e57b2d0eSRandall Stewart &tcp_inp_lro_single_push, "Number of lro's sent with single segment"); 118e57b2d0eSRandall Stewart SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, lockcnt, CTLFLAG_RD, 119e57b2d0eSRandall Stewart &tcp_inp_lro_locks_taken, "Number of lro's inp_wlocks taken"); 120e57b2d0eSRandall Stewart SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, sackwakeups, CTLFLAG_RD, 121e57b2d0eSRandall Stewart &tcp_inp_lro_sack_wake, "Number of wakeups caused by sack/fin"); 122*69a34e8dSRandall Stewart SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, extra_mbuf, CTLFLAG_RD, 123*69a34e8dSRandall Stewart &tcp_extra_mbuf, "Number of times we had an extra compressed ack dropped into the tp"); 124*69a34e8dSRandall Stewart SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, would_have_but, CTLFLAG_RD, 125*69a34e8dSRandall Stewart &tcp_would_have_but, "Number of times we would have had an extra compressed but out of room"); 126*69a34e8dSRandall Stewart SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, with_m_ackcmp, CTLFLAG_RD, 127*69a34e8dSRandall Stewart &tcp_comp_total, "Number of mbufs queued with M_ACKCMP flags set"); 128*69a34e8dSRandall Stewart SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, without_m_ackcmp, CTLFLAG_RD, 129*69a34e8dSRandall Stewart &tcp_uncomp_total, "Number of mbufs queued without M_ACKCMP"); 130*69a34e8dSRandall Stewart SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, csum_hw, CTLFLAG_RD, 131*69a34e8dSRandall Stewart &tcp_csum_hardware, "Number of checksums processed in hardware"); 132*69a34e8dSRandall Stewart SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, csum_hw_ph, CTLFLAG_RD, 133*69a34e8dSRandall Stewart &tcp_csum_hardware_w_ph, "Number of checksums processed in hardware with pseudo header"); 134*69a34e8dSRandall Stewart SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, csum_sw, CTLFLAG_RD, 135*69a34e8dSRandall Stewart &tcp_csum_software, "Number of checksums processed in software"); 136*69a34e8dSRandall Stewart 137*69a34e8dSRandall Stewart 138e57b2d0eSRandall Stewart 139e57b2d0eSRandall Stewart void 140e57b2d0eSRandall Stewart tcp_lro_reg_mbufq(void) 141e57b2d0eSRandall Stewart { 142e57b2d0eSRandall Stewart atomic_fetchadd_long(&tcplro_stacks_wanting_mbufq, 1); 143e57b2d0eSRandall Stewart } 144e57b2d0eSRandall Stewart 145e57b2d0eSRandall Stewart void 146e57b2d0eSRandall Stewart tcp_lro_dereg_mbufq(void) 147e57b2d0eSRandall Stewart { 148e57b2d0eSRandall Stewart atomic_fetchadd_long(&tcplro_stacks_wanting_mbufq, -1); 149e57b2d0eSRandall Stewart } 1508452c1b3SSepherosa Ziehau 15151e3c20dSSepherosa Ziehau static __inline void 15205cde7efSSepherosa Ziehau tcp_lro_active_insert(struct lro_ctrl *lc, struct lro_head *bucket, 15305cde7efSSepherosa Ziehau struct lro_entry *le) 15451e3c20dSSepherosa Ziehau { 15551e3c20dSSepherosa Ziehau 15651e3c20dSSepherosa Ziehau LIST_INSERT_HEAD(&lc->lro_active, le, next); 15705cde7efSSepherosa Ziehau LIST_INSERT_HEAD(bucket, le, hash_next); 15851e3c20dSSepherosa Ziehau } 15951e3c20dSSepherosa Ziehau 16051e3c20dSSepherosa Ziehau static __inline void 16151e3c20dSSepherosa Ziehau tcp_lro_active_remove(struct lro_entry *le) 16251e3c20dSSepherosa Ziehau { 16351e3c20dSSepherosa Ziehau 16405cde7efSSepherosa Ziehau LIST_REMOVE(le, next); /* active list */ 16505cde7efSSepherosa Ziehau LIST_REMOVE(le, hash_next); /* hash bucket */ 16651e3c20dSSepherosa Ziehau } 16751e3c20dSSepherosa Ziehau 1686c5087a8SJack F Vogel int 16962b5b6ecSBjoern A. Zeeb tcp_lro_init(struct lro_ctrl *lc) 1706c5087a8SJack F Vogel { 1718452c1b3SSepherosa Ziehau return (tcp_lro_init_args(lc, NULL, tcp_lro_entries, 0)); 172e936121dSHans Petter Selasky } 173e936121dSHans Petter Selasky 174e936121dSHans Petter Selasky int 175e936121dSHans Petter Selasky tcp_lro_init_args(struct lro_ctrl *lc, struct ifnet *ifp, 176e936121dSHans Petter Selasky unsigned lro_entries, unsigned lro_mbufs) 177e936121dSHans Petter Selasky { 17862b5b6ecSBjoern A. Zeeb struct lro_entry *le; 179e936121dSHans Petter Selasky size_t size; 18005cde7efSSepherosa Ziehau unsigned i, elements; 1816c5087a8SJack F Vogel 18262b5b6ecSBjoern A. Zeeb lc->lro_bad_csum = 0; 18362b5b6ecSBjoern A. Zeeb lc->lro_queued = 0; 18462b5b6ecSBjoern A. Zeeb lc->lro_flushed = 0; 185e936121dSHans Petter Selasky lc->lro_mbuf_count = 0; 186e936121dSHans Petter Selasky lc->lro_mbuf_max = lro_mbufs; 187e936121dSHans Petter Selasky lc->lro_cnt = lro_entries; 1887ae3d4bfSSepherosa Ziehau lc->lro_ackcnt_lim = TCP_LRO_ACKCNT_MAX; 1897ae3d4bfSSepherosa Ziehau lc->lro_length_lim = TCP_LRO_LENGTH_MAX; 190e936121dSHans Petter Selasky lc->ifp = ifp; 1911ea44822SSepherosa Ziehau LIST_INIT(&lc->lro_free); 1921ea44822SSepherosa Ziehau LIST_INIT(&lc->lro_active); 1936c5087a8SJack F Vogel 19405cde7efSSepherosa Ziehau /* create hash table to accelerate entry lookup */ 19505cde7efSSepherosa Ziehau if (lro_entries > lro_mbufs) 19605cde7efSSepherosa Ziehau elements = lro_entries; 19705cde7efSSepherosa Ziehau else 19805cde7efSSepherosa Ziehau elements = lro_mbufs; 19905cde7efSSepherosa Ziehau lc->lro_hash = phashinit_flags(elements, M_LRO, &lc->lro_hashsz, 20005cde7efSSepherosa Ziehau HASH_NOWAIT); 20105cde7efSSepherosa Ziehau if (lc->lro_hash == NULL) { 20205cde7efSSepherosa Ziehau memset(lc, 0, sizeof(*lc)); 20305cde7efSSepherosa Ziehau return (ENOMEM); 20405cde7efSSepherosa Ziehau } 20505cde7efSSepherosa Ziehau 206e936121dSHans Petter Selasky /* compute size to allocate */ 207fc271df3SHans Petter Selasky size = (lro_mbufs * sizeof(struct lro_mbuf_sort)) + 208e936121dSHans Petter Selasky (lro_entries * sizeof(*le)); 209fc271df3SHans Petter Selasky lc->lro_mbuf_data = (struct lro_mbuf_sort *) 210e936121dSHans Petter Selasky malloc(size, M_LRO, M_NOWAIT | M_ZERO); 2116c5087a8SJack F Vogel 212e936121dSHans Petter Selasky /* check for out of memory */ 213e936121dSHans Petter Selasky if (lc->lro_mbuf_data == NULL) { 214a3927369SNavdeep Parhar free(lc->lro_hash, M_LRO); 215e936121dSHans Petter Selasky memset(lc, 0, sizeof(*lc)); 216e936121dSHans Petter Selasky return (ENOMEM); 217e936121dSHans Petter Selasky } 218e936121dSHans Petter Selasky /* compute offset for LRO entries */ 219e936121dSHans Petter Selasky le = (struct lro_entry *) 220e936121dSHans Petter Selasky (lc->lro_mbuf_data + lro_mbufs); 221e936121dSHans Petter Selasky 222e936121dSHans Petter Selasky /* setup linked list */ 223e936121dSHans Petter Selasky for (i = 0; i != lro_entries; i++) 2241ea44822SSepherosa Ziehau LIST_INSERT_HEAD(&lc->lro_free, le + i, next); 225e936121dSHans Petter Selasky 226e936121dSHans Petter Selasky return (0); 2276c5087a8SJack F Vogel } 2286c5087a8SJack F Vogel 229e57b2d0eSRandall Stewart static struct tcphdr * 230e57b2d0eSRandall Stewart tcp_lro_get_th(struct lro_entry *le, struct mbuf *m) 231e57b2d0eSRandall Stewart { 232e57b2d0eSRandall Stewart struct ether_header *eh; 233e57b2d0eSRandall Stewart struct tcphdr *th = NULL; 234e57b2d0eSRandall Stewart #ifdef INET6 235e57b2d0eSRandall Stewart struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */ 236e57b2d0eSRandall Stewart #endif 237e57b2d0eSRandall Stewart #ifdef INET 238e57b2d0eSRandall Stewart struct ip *ip4 = NULL; /* Keep compiler happy. */ 239e57b2d0eSRandall Stewart #endif 240e57b2d0eSRandall Stewart 241e57b2d0eSRandall Stewart eh = mtod(m, struct ether_header *); 242e57b2d0eSRandall Stewart switch (le->eh_type) { 243e57b2d0eSRandall Stewart #ifdef INET6 244e57b2d0eSRandall Stewart case ETHERTYPE_IPV6: 245e57b2d0eSRandall Stewart ip6 = (struct ip6_hdr *)(eh + 1); 246e57b2d0eSRandall Stewart th = (struct tcphdr *)(ip6 + 1); 247e57b2d0eSRandall Stewart break; 248e57b2d0eSRandall Stewart #endif 249e57b2d0eSRandall Stewart #ifdef INET 250e57b2d0eSRandall Stewart case ETHERTYPE_IP: 251e57b2d0eSRandall Stewart ip4 = (struct ip *)(eh + 1); 252e57b2d0eSRandall Stewart th = (struct tcphdr *)(ip4 + 1); 253e57b2d0eSRandall Stewart break; 254e57b2d0eSRandall Stewart #endif 255e57b2d0eSRandall Stewart } 256e57b2d0eSRandall Stewart return (th); 257e57b2d0eSRandall Stewart } 258e57b2d0eSRandall Stewart 259*69a34e8dSRandall Stewart static void 260*69a34e8dSRandall Stewart lro_free_mbuf_chain(struct mbuf *m) 261*69a34e8dSRandall Stewart { 262*69a34e8dSRandall Stewart struct mbuf *save; 263*69a34e8dSRandall Stewart 264*69a34e8dSRandall Stewart while (m) { 265*69a34e8dSRandall Stewart save = m->m_nextpkt; 266*69a34e8dSRandall Stewart m->m_nextpkt = NULL; 267*69a34e8dSRandall Stewart m_freem(m); 268*69a34e8dSRandall Stewart m = save; 269*69a34e8dSRandall Stewart } 270*69a34e8dSRandall Stewart } 271*69a34e8dSRandall Stewart 2726c5087a8SJack F Vogel void 27362b5b6ecSBjoern A. Zeeb tcp_lro_free(struct lro_ctrl *lc) 2746c5087a8SJack F Vogel { 27562b5b6ecSBjoern A. Zeeb struct lro_entry *le; 276e936121dSHans Petter Selasky unsigned x; 2776c5087a8SJack F Vogel 278e936121dSHans Petter Selasky /* reset LRO free list */ 2791ea44822SSepherosa Ziehau LIST_INIT(&lc->lro_free); 280e936121dSHans Petter Selasky 281e936121dSHans Petter Selasky /* free active mbufs, if any */ 2821ea44822SSepherosa Ziehau while ((le = LIST_FIRST(&lc->lro_active)) != NULL) { 28351e3c20dSSepherosa Ziehau tcp_lro_active_remove(le); 284*69a34e8dSRandall Stewart lro_free_mbuf_chain(le->m_head); 2856c5087a8SJack F Vogel } 286e936121dSHans Petter Selasky 28705cde7efSSepherosa Ziehau /* free hash table */ 28805cde7efSSepherosa Ziehau free(lc->lro_hash, M_LRO); 28905cde7efSSepherosa Ziehau lc->lro_hash = NULL; 29005cde7efSSepherosa Ziehau lc->lro_hashsz = 0; 29105cde7efSSepherosa Ziehau 292e936121dSHans Petter Selasky /* free mbuf array, if any */ 293e936121dSHans Petter Selasky for (x = 0; x != lc->lro_mbuf_count; x++) 294fc271df3SHans Petter Selasky m_freem(lc->lro_mbuf_data[x].mb); 295e936121dSHans Petter Selasky lc->lro_mbuf_count = 0; 296e936121dSHans Petter Selasky 297e936121dSHans Petter Selasky /* free allocated memory, if any */ 298e936121dSHans Petter Selasky free(lc->lro_mbuf_data, M_LRO); 299e936121dSHans Petter Selasky lc->lro_mbuf_data = NULL; 3006c5087a8SJack F Vogel } 3016c5087a8SJack F Vogel 30262b5b6ecSBjoern A. Zeeb static uint16_t 30362b5b6ecSBjoern A. Zeeb tcp_lro_csum_th(struct tcphdr *th) 30462b5b6ecSBjoern A. Zeeb { 30562b5b6ecSBjoern A. Zeeb uint32_t ch; 30662b5b6ecSBjoern A. Zeeb uint16_t *p, l; 30762b5b6ecSBjoern A. Zeeb 30862b5b6ecSBjoern A. Zeeb ch = th->th_sum = 0x0000; 30962b5b6ecSBjoern A. Zeeb l = th->th_off; 31062b5b6ecSBjoern A. Zeeb p = (uint16_t *)th; 31162b5b6ecSBjoern A. Zeeb while (l > 0) { 31262b5b6ecSBjoern A. Zeeb ch += *p; 31362b5b6ecSBjoern A. Zeeb p++; 31462b5b6ecSBjoern A. Zeeb ch += *p; 31562b5b6ecSBjoern A. Zeeb p++; 31662b5b6ecSBjoern A. Zeeb l--; 31762b5b6ecSBjoern A. Zeeb } 31862b5b6ecSBjoern A. Zeeb while (ch > 0xffff) 31962b5b6ecSBjoern A. Zeeb ch = (ch >> 16) + (ch & 0xffff); 32062b5b6ecSBjoern A. Zeeb 32162b5b6ecSBjoern A. Zeeb return (ch & 0xffff); 32262b5b6ecSBjoern A. Zeeb } 32362b5b6ecSBjoern A. Zeeb 32462b5b6ecSBjoern A. Zeeb static uint16_t 32562b5b6ecSBjoern A. Zeeb tcp_lro_rx_csum_fixup(struct lro_entry *le, void *l3hdr, struct tcphdr *th, 32662b5b6ecSBjoern A. Zeeb uint16_t tcp_data_len, uint16_t csum) 32762b5b6ecSBjoern A. Zeeb { 32862b5b6ecSBjoern A. Zeeb uint32_t c; 32962b5b6ecSBjoern A. Zeeb uint16_t cs; 33062b5b6ecSBjoern A. Zeeb 33162b5b6ecSBjoern A. Zeeb c = csum; 33262b5b6ecSBjoern A. Zeeb 33362b5b6ecSBjoern A. Zeeb /* Remove length from checksum. */ 33462b5b6ecSBjoern A. Zeeb switch (le->eh_type) { 33562b5b6ecSBjoern A. Zeeb #ifdef INET6 33662b5b6ecSBjoern A. Zeeb case ETHERTYPE_IPV6: 33762b5b6ecSBjoern A. Zeeb { 33862b5b6ecSBjoern A. Zeeb struct ip6_hdr *ip6; 33962b5b6ecSBjoern A. Zeeb 34062b5b6ecSBjoern A. Zeeb ip6 = (struct ip6_hdr *)l3hdr; 34162b5b6ecSBjoern A. Zeeb if (le->append_cnt == 0) 34262b5b6ecSBjoern A. Zeeb cs = ip6->ip6_plen; 34362b5b6ecSBjoern A. Zeeb else { 34462b5b6ecSBjoern A. Zeeb uint32_t cx; 34562b5b6ecSBjoern A. Zeeb 34662b5b6ecSBjoern A. Zeeb cx = ntohs(ip6->ip6_plen); 34762b5b6ecSBjoern A. Zeeb cs = in6_cksum_pseudo(ip6, cx, ip6->ip6_nxt, 0); 34862b5b6ecSBjoern A. Zeeb } 34962b5b6ecSBjoern A. Zeeb break; 35062b5b6ecSBjoern A. Zeeb } 35162b5b6ecSBjoern A. Zeeb #endif 35262b5b6ecSBjoern A. Zeeb #ifdef INET 35362b5b6ecSBjoern A. Zeeb case ETHERTYPE_IP: 35462b5b6ecSBjoern A. Zeeb { 35562b5b6ecSBjoern A. Zeeb struct ip *ip4; 35662b5b6ecSBjoern A. Zeeb 35762b5b6ecSBjoern A. Zeeb ip4 = (struct ip *)l3hdr; 35862b5b6ecSBjoern A. Zeeb if (le->append_cnt == 0) 35962b5b6ecSBjoern A. Zeeb cs = ip4->ip_len; 36062b5b6ecSBjoern A. Zeeb else { 36162b5b6ecSBjoern A. Zeeb cs = in_addword(ntohs(ip4->ip_len) - sizeof(*ip4), 36262b5b6ecSBjoern A. Zeeb IPPROTO_TCP); 36362b5b6ecSBjoern A. Zeeb cs = in_pseudo(ip4->ip_src.s_addr, ip4->ip_dst.s_addr, 36462b5b6ecSBjoern A. Zeeb htons(cs)); 36562b5b6ecSBjoern A. Zeeb } 36662b5b6ecSBjoern A. Zeeb break; 36762b5b6ecSBjoern A. Zeeb } 36862b5b6ecSBjoern A. Zeeb #endif 36962b5b6ecSBjoern A. Zeeb default: 37062b5b6ecSBjoern A. Zeeb cs = 0; /* Keep compiler happy. */ 37162b5b6ecSBjoern A. Zeeb } 37262b5b6ecSBjoern A. Zeeb 37362b5b6ecSBjoern A. Zeeb cs = ~cs; 37462b5b6ecSBjoern A. Zeeb c += cs; 37562b5b6ecSBjoern A. Zeeb 37662b5b6ecSBjoern A. Zeeb /* Remove TCP header csum. */ 37762b5b6ecSBjoern A. Zeeb cs = ~tcp_lro_csum_th(th); 37862b5b6ecSBjoern A. Zeeb c += cs; 37962b5b6ecSBjoern A. Zeeb while (c > 0xffff) 38062b5b6ecSBjoern A. Zeeb c = (c >> 16) + (c & 0xffff); 38162b5b6ecSBjoern A. Zeeb 38262b5b6ecSBjoern A. Zeeb return (c & 0xffff); 38362b5b6ecSBjoern A. Zeeb } 38462b5b6ecSBjoern A. Zeeb 3856dd38b87SSepherosa Ziehau static void 3866dd38b87SSepherosa Ziehau tcp_lro_rx_done(struct lro_ctrl *lc) 3876dd38b87SSepherosa Ziehau { 3886dd38b87SSepherosa Ziehau struct lro_entry *le; 3896dd38b87SSepherosa Ziehau 3901ea44822SSepherosa Ziehau while ((le = LIST_FIRST(&lc->lro_active)) != NULL) { 39151e3c20dSSepherosa Ziehau tcp_lro_active_remove(le); 3926dd38b87SSepherosa Ziehau tcp_lro_flush(lc, le); 3936dd38b87SSepherosa Ziehau } 3946dd38b87SSepherosa Ziehau } 3956dd38b87SSepherosa Ziehau 3966c5087a8SJack F Vogel void 3977127e6acSNavdeep Parhar tcp_lro_flush_inactive(struct lro_ctrl *lc, const struct timeval *timeout) 3987127e6acSNavdeep Parhar { 3997127e6acSNavdeep Parhar struct lro_entry *le, *le_tmp; 4007127e6acSNavdeep Parhar struct timeval tv; 4017127e6acSNavdeep Parhar 4021ea44822SSepherosa Ziehau if (LIST_EMPTY(&lc->lro_active)) 4037127e6acSNavdeep Parhar return; 4047127e6acSNavdeep Parhar 405e57b2d0eSRandall Stewart getmicrouptime(&tv); 4067127e6acSNavdeep Parhar timevalsub(&tv, timeout); 4071ea44822SSepherosa Ziehau LIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) { 4087127e6acSNavdeep Parhar if (timevalcmp(&tv, &le->mtime, >=)) { 40951e3c20dSSepherosa Ziehau tcp_lro_active_remove(le); 4107127e6acSNavdeep Parhar tcp_lro_flush(lc, le); 4117127e6acSNavdeep Parhar } 4127127e6acSNavdeep Parhar } 4137127e6acSNavdeep Parhar } 4147127e6acSNavdeep Parhar 415e57b2d0eSRandall Stewart #ifdef INET6 416e57b2d0eSRandall Stewart static int 417e57b2d0eSRandall Stewart tcp_lro_rx_ipv6(struct lro_ctrl *lc, struct mbuf *m, struct ip6_hdr *ip6, 418e57b2d0eSRandall Stewart struct tcphdr **th) 4196c5087a8SJack F Vogel { 4206c5087a8SJack F Vogel 421e57b2d0eSRandall Stewart /* XXX-BZ we should check the flow-label. */ 422e57b2d0eSRandall Stewart 423e57b2d0eSRandall Stewart /* XXX-BZ We do not yet support ext. hdrs. */ 424e57b2d0eSRandall Stewart if (ip6->ip6_nxt != IPPROTO_TCP) 425e57b2d0eSRandall Stewart return (TCP_LRO_NOT_SUPPORTED); 426e57b2d0eSRandall Stewart 427e57b2d0eSRandall Stewart /* Find the TCP header. */ 428e57b2d0eSRandall Stewart *th = (struct tcphdr *)(ip6 + 1); 429e57b2d0eSRandall Stewart 430e57b2d0eSRandall Stewart return (0); 431e57b2d0eSRandall Stewart } 432e57b2d0eSRandall Stewart #endif 433e57b2d0eSRandall Stewart 434e57b2d0eSRandall Stewart #ifdef INET 435e57b2d0eSRandall Stewart static int 436e57b2d0eSRandall Stewart tcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4, 437e57b2d0eSRandall Stewart struct tcphdr **th) 438e57b2d0eSRandall Stewart { 439e57b2d0eSRandall Stewart int csum_flags; 440e57b2d0eSRandall Stewart uint16_t csum; 441e57b2d0eSRandall Stewart 442e57b2d0eSRandall Stewart if (ip4->ip_p != IPPROTO_TCP) 443e57b2d0eSRandall Stewart return (TCP_LRO_NOT_SUPPORTED); 444e57b2d0eSRandall Stewart 445e57b2d0eSRandall Stewart /* Ensure there are no options. */ 446e57b2d0eSRandall Stewart if ((ip4->ip_hl << 2) != sizeof (*ip4)) 447e57b2d0eSRandall Stewart return (TCP_LRO_CANNOT); 448e57b2d0eSRandall Stewart 449e57b2d0eSRandall Stewart /* .. and the packet is not fragmented. */ 450e57b2d0eSRandall Stewart if (ip4->ip_off & htons(IP_MF|IP_OFFMASK)) 451e57b2d0eSRandall Stewart return (TCP_LRO_CANNOT); 452e57b2d0eSRandall Stewart 453e57b2d0eSRandall Stewart /* Legacy IP has a header checksum that needs to be correct. */ 454e57b2d0eSRandall Stewart csum_flags = m->m_pkthdr.csum_flags; 455e57b2d0eSRandall Stewart if (csum_flags & CSUM_IP_CHECKED) { 456e57b2d0eSRandall Stewart if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) { 457e57b2d0eSRandall Stewart lc->lro_bad_csum++; 458e57b2d0eSRandall Stewart return (TCP_LRO_CANNOT); 459e57b2d0eSRandall Stewart } 460e57b2d0eSRandall Stewart } else { 461e57b2d0eSRandall Stewart csum = in_cksum_hdr(ip4); 462e57b2d0eSRandall Stewart if (__predict_false((csum) != 0)) { 463e57b2d0eSRandall Stewart lc->lro_bad_csum++; 464e57b2d0eSRandall Stewart return (TCP_LRO_CANNOT); 465e57b2d0eSRandall Stewart } 466e57b2d0eSRandall Stewart } 467e57b2d0eSRandall Stewart /* Find the TCP header (we assured there are no IP options). */ 468e57b2d0eSRandall Stewart *th = (struct tcphdr *)(ip4 + 1); 469e57b2d0eSRandall Stewart return (0); 470e57b2d0eSRandall Stewart } 471e57b2d0eSRandall Stewart #endif 472e57b2d0eSRandall Stewart 473e57b2d0eSRandall Stewart static void 474e57b2d0eSRandall Stewart tcp_lro_log(struct tcpcb *tp, struct lro_ctrl *lc, 475e57b2d0eSRandall Stewart struct lro_entry *le, struct mbuf *m, int frm, int32_t tcp_data_len, 476e57b2d0eSRandall Stewart uint32_t th_seq , uint32_t th_ack, uint16_t th_win) 477e57b2d0eSRandall Stewart { 478e57b2d0eSRandall Stewart if (tp->t_logstate != TCP_LOG_STATE_OFF) { 479e57b2d0eSRandall Stewart union tcp_log_stackspecific log; 480e57b2d0eSRandall Stewart struct timeval tv; 481e57b2d0eSRandall Stewart uint32_t cts; 482e57b2d0eSRandall Stewart 483e57b2d0eSRandall Stewart cts = tcp_get_usecs(&tv); 484e57b2d0eSRandall Stewart memset(&log, 0, sizeof(union tcp_log_stackspecific)); 485e57b2d0eSRandall Stewart log.u_bbr.flex8 = frm; 486e57b2d0eSRandall Stewart log.u_bbr.flex1 = tcp_data_len; 487e57b2d0eSRandall Stewart if (m) 488e57b2d0eSRandall Stewart log.u_bbr.flex2 = m->m_pkthdr.len; 489e57b2d0eSRandall Stewart else 490e57b2d0eSRandall Stewart log.u_bbr.flex2 = 0; 491e57b2d0eSRandall Stewart log.u_bbr.flex3 = le->append_cnt; 492e57b2d0eSRandall Stewart log.u_bbr.flex4 = le->p_len; 493*69a34e8dSRandall Stewart if (le->m_head) { 494e57b2d0eSRandall Stewart log.u_bbr.flex5 = le->m_head->m_pkthdr.len; 495e57b2d0eSRandall Stewart log.u_bbr.delRate = le->m_head->m_flags; 496e57b2d0eSRandall Stewart log.u_bbr.rttProp = le->m_head->m_pkthdr.rcv_tstmp; 497*69a34e8dSRandall Stewart } 498e57b2d0eSRandall Stewart log.u_bbr.inflight = th_seq; 499e57b2d0eSRandall Stewart log.u_bbr.timeStamp = cts; 500e57b2d0eSRandall Stewart log.u_bbr.epoch = le->next_seq; 501e57b2d0eSRandall Stewart log.u_bbr.delivered = th_ack; 502e57b2d0eSRandall Stewart log.u_bbr.lt_epoch = le->ack_seq; 503e57b2d0eSRandall Stewart log.u_bbr.pacing_gain = th_win; 504e57b2d0eSRandall Stewart log.u_bbr.cwnd_gain = le->window; 505b23b156eSWarner Losh log.u_bbr.cur_del_rate = (uintptr_t)m; 506b23b156eSWarner Losh log.u_bbr.bw_inuse = (uintptr_t)le->m_head; 507e57b2d0eSRandall Stewart log.u_bbr.pkts_out = le->mbuf_cnt; /* Total mbufs added */ 508e57b2d0eSRandall Stewart log.u_bbr.applimited = le->ulp_csum; 509e57b2d0eSRandall Stewart log.u_bbr.lost = le->mbuf_appended; 510*69a34e8dSRandall Stewart log.u_bbr.pkt_epoch = le->cmp_ack_cnt; 511*69a34e8dSRandall Stewart log.u_bbr.flex6 = tcp_tv_to_usectick(&lc->lro_last_flush); 512*69a34e8dSRandall Stewart if (in_epoch(net_epoch_preempt)) 513*69a34e8dSRandall Stewart log.u_bbr.inhpts = 1; 514*69a34e8dSRandall Stewart else 515*69a34e8dSRandall Stewart log.u_bbr.inhpts = 0; 516e57b2d0eSRandall Stewart TCP_LOG_EVENTP(tp, NULL, 517e57b2d0eSRandall Stewart &tp->t_inpcb->inp_socket->so_rcv, 518e57b2d0eSRandall Stewart &tp->t_inpcb->inp_socket->so_snd, 519e57b2d0eSRandall Stewart TCP_LOG_LRO, 0, 520e57b2d0eSRandall Stewart 0, &log, false, &tv); 521e57b2d0eSRandall Stewart } 522e57b2d0eSRandall Stewart } 523e57b2d0eSRandall Stewart 524e57b2d0eSRandall Stewart static void 525*69a34e8dSRandall Stewart tcp_flush_out_le(struct tcpcb *tp, struct lro_ctrl *lc, struct lro_entry *le) 526e57b2d0eSRandall Stewart { 527e57b2d0eSRandall Stewart if (le->append_cnt > 1) { 52862b5b6ecSBjoern A. Zeeb struct tcphdr *th; 52962b5b6ecSBjoern A. Zeeb uint16_t p_len; 5306c5087a8SJack F Vogel 53162b5b6ecSBjoern A. Zeeb p_len = htons(le->p_len); 53262b5b6ecSBjoern A. Zeeb switch (le->eh_type) { 53362b5b6ecSBjoern A. Zeeb #ifdef INET6 53462b5b6ecSBjoern A. Zeeb case ETHERTYPE_IPV6: 5356c5087a8SJack F Vogel { 53662b5b6ecSBjoern A. Zeeb struct ip6_hdr *ip6; 53762b5b6ecSBjoern A. Zeeb 53862b5b6ecSBjoern A. Zeeb ip6 = le->le_ip6; 53962b5b6ecSBjoern A. Zeeb ip6->ip6_plen = p_len; 54062b5b6ecSBjoern A. Zeeb th = (struct tcphdr *)(ip6 + 1); 54162b5b6ecSBjoern A. Zeeb le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID | 54262b5b6ecSBjoern A. Zeeb CSUM_PSEUDO_HDR; 54362b5b6ecSBjoern A. Zeeb le->p_len += ETHER_HDR_LEN + sizeof(*ip6); 54462b5b6ecSBjoern A. Zeeb break; 54562b5b6ecSBjoern A. Zeeb } 54662b5b6ecSBjoern A. Zeeb #endif 54762b5b6ecSBjoern A. Zeeb #ifdef INET 54862b5b6ecSBjoern A. Zeeb case ETHERTYPE_IP: 54962b5b6ecSBjoern A. Zeeb { 55062b5b6ecSBjoern A. Zeeb struct ip *ip4; 55162b5b6ecSBjoern A. Zeeb uint32_t cl; 55262b5b6ecSBjoern A. Zeeb uint16_t c; 55362b5b6ecSBjoern A. Zeeb 55462b5b6ecSBjoern A. Zeeb ip4 = le->le_ip4; 55562b5b6ecSBjoern A. Zeeb /* Fix IP header checksum for new length. */ 55662b5b6ecSBjoern A. Zeeb c = ~ip4->ip_sum; 55762b5b6ecSBjoern A. Zeeb cl = c; 55862b5b6ecSBjoern A. Zeeb c = ~ip4->ip_len; 55962b5b6ecSBjoern A. Zeeb cl += c + p_len; 56062b5b6ecSBjoern A. Zeeb while (cl > 0xffff) 56162b5b6ecSBjoern A. Zeeb cl = (cl >> 16) + (cl & 0xffff); 56262b5b6ecSBjoern A. Zeeb c = cl; 56362b5b6ecSBjoern A. Zeeb ip4->ip_sum = ~c; 56462b5b6ecSBjoern A. Zeeb ip4->ip_len = p_len; 56562b5b6ecSBjoern A. Zeeb th = (struct tcphdr *)(ip4 + 1); 56662b5b6ecSBjoern A. Zeeb le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID | 56762b5b6ecSBjoern A. Zeeb CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID; 56862b5b6ecSBjoern A. Zeeb le->p_len += ETHER_HDR_LEN; 56962b5b6ecSBjoern A. Zeeb break; 57062b5b6ecSBjoern A. Zeeb } 57162b5b6ecSBjoern A. Zeeb #endif 57262b5b6ecSBjoern A. Zeeb default: 57362b5b6ecSBjoern A. Zeeb th = NULL; /* Keep compiler happy. */ 57462b5b6ecSBjoern A. Zeeb } 57562b5b6ecSBjoern A. Zeeb le->m_head->m_pkthdr.csum_data = 0xffff; 57662b5b6ecSBjoern A. Zeeb le->m_head->m_pkthdr.len = le->p_len; 57762b5b6ecSBjoern A. Zeeb 57862b5b6ecSBjoern A. Zeeb /* Incorporate the latest ACK into the TCP header. */ 57962b5b6ecSBjoern A. Zeeb th->th_ack = le->ack_seq; 58062b5b6ecSBjoern A. Zeeb th->th_win = le->window; 58162b5b6ecSBjoern A. Zeeb /* Incorporate latest timestamp into the TCP header. */ 58262b5b6ecSBjoern A. Zeeb if (le->timestamp != 0) { 5836c5087a8SJack F Vogel uint32_t *ts_ptr; 5846c5087a8SJack F Vogel 58562b5b6ecSBjoern A. Zeeb ts_ptr = (uint32_t *)(th + 1); 58662b5b6ecSBjoern A. Zeeb ts_ptr[1] = htonl(le->tsval); 58762b5b6ecSBjoern A. Zeeb ts_ptr[2] = le->tsecr; 58862b5b6ecSBjoern A. Zeeb } 58962b5b6ecSBjoern A. Zeeb /* Update the TCP header checksum. */ 59062b5b6ecSBjoern A. Zeeb le->ulp_csum += p_len; 59162b5b6ecSBjoern A. Zeeb le->ulp_csum += tcp_lro_csum_th(th); 59262b5b6ecSBjoern A. Zeeb while (le->ulp_csum > 0xffff) 59362b5b6ecSBjoern A. Zeeb le->ulp_csum = (le->ulp_csum >> 16) + 59462b5b6ecSBjoern A. Zeeb (le->ulp_csum & 0xffff); 59562b5b6ecSBjoern A. Zeeb th->th_sum = (le->ulp_csum & 0xffff); 59662b5b6ecSBjoern A. Zeeb th->th_sum = ~th->th_sum; 597e57b2d0eSRandall Stewart } 598e57b2d0eSRandall Stewart /* 599e57b2d0eSRandall Stewart * Break any chain, this is not set to NULL on the singleton 600e57b2d0eSRandall Stewart * case m_nextpkt points to m_head. Other case set them 601e57b2d0eSRandall Stewart * m_nextpkt to NULL in push_and_replace. 602e57b2d0eSRandall Stewart */ 603e57b2d0eSRandall Stewart le->m_head->m_nextpkt = NULL; 604e57b2d0eSRandall Stewart le->m_head->m_pkthdr.lro_nsegs = le->append_cnt; 605e57b2d0eSRandall Stewart (*lc->ifp->if_input)(lc->ifp, le->m_head); 606e57b2d0eSRandall Stewart lc->lro_queued += le->append_cnt; 60762b5b6ecSBjoern A. Zeeb } 6086c5087a8SJack F Vogel 609e57b2d0eSRandall Stewart static void 610e57b2d0eSRandall Stewart tcp_set_le_to_m(struct lro_ctrl *lc, struct lro_entry *le, struct mbuf *m) 611e57b2d0eSRandall Stewart { 612e57b2d0eSRandall Stewart struct ether_header *eh; 613e57b2d0eSRandall Stewart void *l3hdr = NULL; /* Keep compiler happy. */ 614e57b2d0eSRandall Stewart struct tcphdr *th; 615e57b2d0eSRandall Stewart #ifdef INET6 616e57b2d0eSRandall Stewart struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */ 617e57b2d0eSRandall Stewart #endif 618e57b2d0eSRandall Stewart #ifdef INET 619e57b2d0eSRandall Stewart struct ip *ip4 = NULL; /* Keep compiler happy. */ 620e57b2d0eSRandall Stewart #endif 621e57b2d0eSRandall Stewart uint32_t *ts_ptr; 622e57b2d0eSRandall Stewart int error, l, ts_failed = 0; 623e57b2d0eSRandall Stewart uint16_t tcp_data_len; 624e57b2d0eSRandall Stewart uint16_t csum; 625e57b2d0eSRandall Stewart 626e57b2d0eSRandall Stewart error = -1; 627e57b2d0eSRandall Stewart eh = mtod(m, struct ether_header *); 628e57b2d0eSRandall Stewart /* 629e57b2d0eSRandall Stewart * We must reset the other pointers since the mbuf 630e57b2d0eSRandall Stewart * we were pointing too is about to go away. 631e57b2d0eSRandall Stewart */ 632e57b2d0eSRandall Stewart switch (le->eh_type) { 633e57b2d0eSRandall Stewart #ifdef INET6 634e57b2d0eSRandall Stewart case ETHERTYPE_IPV6: 635e57b2d0eSRandall Stewart l3hdr = ip6 = (struct ip6_hdr *)(eh + 1); 636e57b2d0eSRandall Stewart error = tcp_lro_rx_ipv6(lc, m, ip6, &th); 637e57b2d0eSRandall Stewart le->le_ip6 = ip6; 638e57b2d0eSRandall Stewart le->source_ip6 = ip6->ip6_src; 639e57b2d0eSRandall Stewart le->dest_ip6 = ip6->ip6_dst; 640e57b2d0eSRandall Stewart le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN - sizeof(*ip6); 641e57b2d0eSRandall Stewart break; 642e57b2d0eSRandall Stewart #endif 643e57b2d0eSRandall Stewart #ifdef INET 644e57b2d0eSRandall Stewart case ETHERTYPE_IP: 645e57b2d0eSRandall Stewart l3hdr = ip4 = (struct ip *)(eh + 1); 646e57b2d0eSRandall Stewart error = tcp_lro_rx_ipv4(lc, m, ip4, &th); 647e57b2d0eSRandall Stewart le->le_ip4 = ip4; 648e57b2d0eSRandall Stewart le->source_ip4 = ip4->ip_src.s_addr; 649e57b2d0eSRandall Stewart le->dest_ip4 = ip4->ip_dst.s_addr; 650e57b2d0eSRandall Stewart le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN; 651e57b2d0eSRandall Stewart break; 652e57b2d0eSRandall Stewart #endif 653e57b2d0eSRandall Stewart } 654e57b2d0eSRandall Stewart KASSERT(error == 0, ("%s: le=%p tcp_lro_rx_xxx failed\n", 655e57b2d0eSRandall Stewart __func__, le)); 656e57b2d0eSRandall Stewart ts_ptr = (uint32_t *)(th + 1); 657e57b2d0eSRandall Stewart l = (th->th_off << 2); 658e57b2d0eSRandall Stewart l -= sizeof(*th); 659e57b2d0eSRandall Stewart if (l != 0 && 660e57b2d0eSRandall Stewart (__predict_false(l != TCPOLEN_TSTAMP_APPA) || 661e57b2d0eSRandall Stewart (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16| 662e57b2d0eSRandall Stewart TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) { 663e57b2d0eSRandall Stewart /* We have failed to find a timestamp some other option? */ 664e57b2d0eSRandall Stewart ts_failed = 1; 665e57b2d0eSRandall Stewart } 666e57b2d0eSRandall Stewart if ((l != 0) && (ts_failed == 0)) { 667e57b2d0eSRandall Stewart le->timestamp = 1; 668e57b2d0eSRandall Stewart le->tsval = ntohl(*(ts_ptr + 1)); 669e57b2d0eSRandall Stewart le->tsecr = *(ts_ptr + 2); 670e57b2d0eSRandall Stewart } else 671e57b2d0eSRandall Stewart le->timestamp = 0; 672e57b2d0eSRandall Stewart le->source_port = th->th_sport; 673e57b2d0eSRandall Stewart le->dest_port = th->th_dport; 674e57b2d0eSRandall Stewart /* Pull out the csum */ 675e57b2d0eSRandall Stewart tcp_data_len = m->m_pkthdr.lro_len; 676e57b2d0eSRandall Stewart le->next_seq = ntohl(th->th_seq) + tcp_data_len; 677e57b2d0eSRandall Stewart le->ack_seq = th->th_ack; 678e57b2d0eSRandall Stewart le->window = th->th_win; 679e57b2d0eSRandall Stewart csum = th->th_sum; 680e57b2d0eSRandall Stewart /* Setup the data pointers */ 681e57b2d0eSRandall Stewart le->m_head = m; 682e57b2d0eSRandall Stewart le->m_tail = m_last(m); 683e57b2d0eSRandall Stewart le->append_cnt = 0; 684e57b2d0eSRandall Stewart le->ulp_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th, tcp_data_len, 685e57b2d0eSRandall Stewart ~csum); 686e57b2d0eSRandall Stewart le->append_cnt++; 687e57b2d0eSRandall Stewart th->th_sum = csum; /* Restore checksum on first packet. */ 688e57b2d0eSRandall Stewart } 689e57b2d0eSRandall Stewart 690e57b2d0eSRandall Stewart static void 691*69a34e8dSRandall Stewart tcp_push_and_replace(struct tcpcb *tp, struct lro_ctrl *lc, struct lro_entry *le, struct mbuf *m) 692e57b2d0eSRandall Stewart { 693e57b2d0eSRandall Stewart /* 694e57b2d0eSRandall Stewart * Push up the stack the current le and replace 695e57b2d0eSRandall Stewart * it with m. 696e57b2d0eSRandall Stewart */ 697e57b2d0eSRandall Stewart struct mbuf *msave; 698e57b2d0eSRandall Stewart 699e57b2d0eSRandall Stewart /* Grab off the next and save it */ 700e57b2d0eSRandall Stewart msave = le->m_head->m_nextpkt; 701e57b2d0eSRandall Stewart le->m_head->m_nextpkt = NULL; 702e57b2d0eSRandall Stewart /* Now push out the old le entry */ 703*69a34e8dSRandall Stewart tcp_flush_out_le(tp, lc, le); 704e57b2d0eSRandall Stewart /* 705e57b2d0eSRandall Stewart * Now to replace the data properly in the le 706e57b2d0eSRandall Stewart * we have to reset the tcp header and 707e57b2d0eSRandall Stewart * other fields. 708e57b2d0eSRandall Stewart */ 709e57b2d0eSRandall Stewart tcp_set_le_to_m(lc, le, m); 710e57b2d0eSRandall Stewart /* Restore the next list */ 711e57b2d0eSRandall Stewart m->m_nextpkt = msave; 712e57b2d0eSRandall Stewart } 713e57b2d0eSRandall Stewart 714e57b2d0eSRandall Stewart static void 715*69a34e8dSRandall Stewart tcp_lro_condense(struct tcpcb *tp, struct lro_ctrl *lc, struct lro_entry *le) 716e57b2d0eSRandall Stewart { 717e57b2d0eSRandall Stewart /* 718e57b2d0eSRandall Stewart * Walk through the mbuf chain we 719e57b2d0eSRandall Stewart * have on tap and compress/condense 720e57b2d0eSRandall Stewart * as required. 721e57b2d0eSRandall Stewart */ 722e57b2d0eSRandall Stewart uint32_t *ts_ptr; 723e57b2d0eSRandall Stewart struct mbuf *m; 724e57b2d0eSRandall Stewart struct tcphdr *th; 725e57b2d0eSRandall Stewart uint16_t tcp_data_len, csum_upd; 726e57b2d0eSRandall Stewart int l; 727e57b2d0eSRandall Stewart 728e57b2d0eSRandall Stewart /* 729e57b2d0eSRandall Stewart * First we must check the lead (m_head) 730e57b2d0eSRandall Stewart * we must make sure that it is *not* 731e57b2d0eSRandall Stewart * something that should be sent up 732e57b2d0eSRandall Stewart * right away (sack etc). 733e57b2d0eSRandall Stewart */ 734e57b2d0eSRandall Stewart again: 735e57b2d0eSRandall Stewart 736e57b2d0eSRandall Stewart m = le->m_head->m_nextpkt; 737e57b2d0eSRandall Stewart if (m == NULL) { 738e57b2d0eSRandall Stewart /* Just the one left */ 739e57b2d0eSRandall Stewart return; 740e57b2d0eSRandall Stewart } 741*69a34e8dSRandall Stewart if (m->m_flags & M_ACKCMP) 742*69a34e8dSRandall Stewart panic("LRO condense lc:%p le:%p reaches with mbuf:%p ackcmp", 743*69a34e8dSRandall Stewart lc, le, m); 744e57b2d0eSRandall Stewart th = tcp_lro_get_th(le, le->m_head); 745e57b2d0eSRandall Stewart KASSERT(th != NULL, 746e57b2d0eSRandall Stewart ("le:%p m:%p th comes back NULL?", le, le->m_head)); 747e57b2d0eSRandall Stewart l = (th->th_off << 2); 748e57b2d0eSRandall Stewart l -= sizeof(*th); 749e57b2d0eSRandall Stewart ts_ptr = (uint32_t *)(th + 1); 750e57b2d0eSRandall Stewart if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) || 751e57b2d0eSRandall Stewart (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16| 752e57b2d0eSRandall Stewart TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) { 753e57b2d0eSRandall Stewart /* 754e57b2d0eSRandall Stewart * Its not the timestamp. We can't 755e57b2d0eSRandall Stewart * use this guy as the head. 756e57b2d0eSRandall Stewart */ 757e57b2d0eSRandall Stewart le->m_head->m_nextpkt = m->m_nextpkt; 758*69a34e8dSRandall Stewart tcp_push_and_replace(tp, lc, le, m); 759e57b2d0eSRandall Stewart goto again; 760e57b2d0eSRandall Stewart } 761e57b2d0eSRandall Stewart if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) { 762e57b2d0eSRandall Stewart /* 763e57b2d0eSRandall Stewart * Make sure that previously seen segements/ACKs are delivered 764e57b2d0eSRandall Stewart * before this segment, e.g. FIN. 765e57b2d0eSRandall Stewart */ 766e57b2d0eSRandall Stewart le->m_head->m_nextpkt = m->m_nextpkt; 767*69a34e8dSRandall Stewart KASSERT(((m->m_flags & M_LRO_EHDRSTRP) == 0) , 768*69a34e8dSRandall Stewart ("tp:%p mbuf:%p has stripped ethernet flags:0x%x", tp, m, m->m_flags)); 769*69a34e8dSRandall Stewart tcp_push_and_replace(tp, lc, le, m); 770e57b2d0eSRandall Stewart goto again; 771e57b2d0eSRandall Stewart } 772e57b2d0eSRandall Stewart while((m = le->m_head->m_nextpkt) != NULL) { 773e57b2d0eSRandall Stewart /* 774e57b2d0eSRandall Stewart * condense m into le, first 775e57b2d0eSRandall Stewart * pull m out of the list. 776e57b2d0eSRandall Stewart */ 777*69a34e8dSRandall Stewart KASSERT(((m->m_flags & M_LRO_EHDRSTRP) == 0) , 778*69a34e8dSRandall Stewart ("tp:%p mbuf:%p has stripped ethernet flags:0x%x", tp, m, m->m_flags)); 779*69a34e8dSRandall Stewart KASSERT(((m->m_flags & M_ACKCMP) == 0), 780*69a34e8dSRandall Stewart ("LRO condense lc:%p le:%p reaches with mbuf:%p ackcmp", lc, le, m)); 781e57b2d0eSRandall Stewart le->m_head->m_nextpkt = m->m_nextpkt; 782e57b2d0eSRandall Stewart m->m_nextpkt = NULL; 783e57b2d0eSRandall Stewart /* Setup my data */ 784e57b2d0eSRandall Stewart tcp_data_len = m->m_pkthdr.lro_len; 785e57b2d0eSRandall Stewart th = tcp_lro_get_th(le, m); 786e57b2d0eSRandall Stewart KASSERT(th != NULL, 787e57b2d0eSRandall Stewart ("le:%p m:%p th comes back NULL?", le, m)); 788e57b2d0eSRandall Stewart ts_ptr = (uint32_t *)(th + 1); 789e57b2d0eSRandall Stewart l = (th->th_off << 2); 790e57b2d0eSRandall Stewart l -= sizeof(*th); 791e57b2d0eSRandall Stewart if (le->append_cnt >= lc->lro_ackcnt_lim) { 792*69a34e8dSRandall Stewart tcp_push_and_replace(tp, lc, le, m); 793e57b2d0eSRandall Stewart goto again; 794e57b2d0eSRandall Stewart } 795e57b2d0eSRandall Stewart if (le->p_len > (lc->lro_length_lim - tcp_data_len)) { 796e57b2d0eSRandall Stewart /* Flush now if appending will result in overflow. */ 797*69a34e8dSRandall Stewart tcp_push_and_replace(tp, lc, le, m); 798e57b2d0eSRandall Stewart goto again; 799e57b2d0eSRandall Stewart } 800e57b2d0eSRandall Stewart if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) || 801e57b2d0eSRandall Stewart (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16| 802e57b2d0eSRandall Stewart TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) { 803e57b2d0eSRandall Stewart /* 804e57b2d0eSRandall Stewart * Maybe a sack in the new one? We need to 805e57b2d0eSRandall Stewart * start all over after flushing the 806e57b2d0eSRandall Stewart * current le. We will go up to the beginning 807e57b2d0eSRandall Stewart * and flush it (calling the replace again possibly 808e57b2d0eSRandall Stewart * or just returning). 809e57b2d0eSRandall Stewart */ 810*69a34e8dSRandall Stewart tcp_push_and_replace(tp, lc, le, m); 811e57b2d0eSRandall Stewart goto again; 812e57b2d0eSRandall Stewart } 813e57b2d0eSRandall Stewart if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) { 814*69a34e8dSRandall Stewart tcp_push_and_replace(tp, lc, le, m); 815e57b2d0eSRandall Stewart goto again; 816e57b2d0eSRandall Stewart } 817e57b2d0eSRandall Stewart if (l != 0) { 818e57b2d0eSRandall Stewart uint32_t tsval = ntohl(*(ts_ptr + 1)); 819e57b2d0eSRandall Stewart /* Make sure timestamp values are increasing. */ 820e57b2d0eSRandall Stewart if (TSTMP_GT(le->tsval, tsval)) { 821*69a34e8dSRandall Stewart tcp_push_and_replace(tp, lc, le, m); 822e57b2d0eSRandall Stewart goto again; 823e57b2d0eSRandall Stewart } 824e57b2d0eSRandall Stewart le->tsval = tsval; 825e57b2d0eSRandall Stewart le->tsecr = *(ts_ptr + 2); 826e57b2d0eSRandall Stewart } 827e57b2d0eSRandall Stewart /* Try to append the new segment. */ 828e57b2d0eSRandall Stewart if (__predict_false(ntohl(th->th_seq) != le->next_seq || 829e57b2d0eSRandall Stewart (tcp_data_len == 0 && 830e57b2d0eSRandall Stewart le->ack_seq == th->th_ack && 831e57b2d0eSRandall Stewart le->window == th->th_win))) { 832e57b2d0eSRandall Stewart /* Out of order packet or duplicate ACK. */ 833*69a34e8dSRandall Stewart tcp_push_and_replace(tp, lc, le, m); 834e57b2d0eSRandall Stewart goto again; 835e57b2d0eSRandall Stewart } 836e57b2d0eSRandall Stewart if (tcp_data_len || SEQ_GT(ntohl(th->th_ack), ntohl(le->ack_seq))) { 837e57b2d0eSRandall Stewart le->next_seq += tcp_data_len; 838e57b2d0eSRandall Stewart le->ack_seq = th->th_ack; 839e57b2d0eSRandall Stewart le->window = th->th_win; 840e57b2d0eSRandall Stewart } else if (th->th_ack == le->ack_seq) { 841e57b2d0eSRandall Stewart le->window = WIN_MAX(le->window, th->th_win); 842e57b2d0eSRandall Stewart } 843e57b2d0eSRandall Stewart csum_upd = m->m_pkthdr.lro_csum; 844e57b2d0eSRandall Stewart le->ulp_csum += csum_upd; 845e57b2d0eSRandall Stewart if (tcp_data_len == 0) { 846e57b2d0eSRandall Stewart le->append_cnt++; 847e57b2d0eSRandall Stewart le->mbuf_cnt--; 848e57b2d0eSRandall Stewart m_freem(m); 849e57b2d0eSRandall Stewart continue; 850e57b2d0eSRandall Stewart } 851e57b2d0eSRandall Stewart le->append_cnt++; 852e57b2d0eSRandall Stewart le->mbuf_appended++; 853e57b2d0eSRandall Stewart le->p_len += tcp_data_len; 854e57b2d0eSRandall Stewart /* 855e57b2d0eSRandall Stewart * Adjust the mbuf so that m_data points to the first byte of 856e57b2d0eSRandall Stewart * the ULP payload. Adjust the mbuf to avoid complications and 857e57b2d0eSRandall Stewart * append new segment to existing mbuf chain. 858e57b2d0eSRandall Stewart */ 859e57b2d0eSRandall Stewart m_adj(m, m->m_pkthdr.len - tcp_data_len); 860e57b2d0eSRandall Stewart m_demote_pkthdr(m); 861e57b2d0eSRandall Stewart le->m_tail->m_next = m; 862e57b2d0eSRandall Stewart le->m_tail = m_last(m); 863e57b2d0eSRandall Stewart } 864e57b2d0eSRandall Stewart } 865e57b2d0eSRandall Stewart 866373013b0SConrad Meyer #ifdef TCPHPTS 867e57b2d0eSRandall Stewart static void 868e57b2d0eSRandall Stewart tcp_queue_pkts(struct tcpcb *tp, struct lro_entry *le) 869e57b2d0eSRandall Stewart { 870e57b2d0eSRandall Stewart if (tp->t_in_pkt == NULL) { 871e57b2d0eSRandall Stewart /* Nothing yet there */ 872e57b2d0eSRandall Stewart tp->t_in_pkt = le->m_head; 873e57b2d0eSRandall Stewart tp->t_tail_pkt = le->m_last_mbuf; 874e57b2d0eSRandall Stewart } else { 875e57b2d0eSRandall Stewart /* Already some there */ 876e57b2d0eSRandall Stewart tp->t_tail_pkt->m_nextpkt = le->m_head; 877e57b2d0eSRandall Stewart tp->t_tail_pkt = le->m_last_mbuf; 878e57b2d0eSRandall Stewart } 879e57b2d0eSRandall Stewart le->m_head = NULL; 880e57b2d0eSRandall Stewart le->m_last_mbuf = NULL; 881e57b2d0eSRandall Stewart } 882e57b2d0eSRandall Stewart 883*69a34e8dSRandall Stewart static struct mbuf * 884*69a34e8dSRandall Stewart tcp_lro_get_last_if_ackcmp(struct lro_ctrl *lc, struct lro_entry *le, struct inpcb *inp) 885e57b2d0eSRandall Stewart { 886*69a34e8dSRandall Stewart struct mbuf *m = NULL; 887*69a34e8dSRandall Stewart struct tcpcb *tp; 888e57b2d0eSRandall Stewart 889*69a34e8dSRandall Stewart tp = intotcpcb(inp); 890*69a34e8dSRandall Stewart if (tp) { 891*69a34e8dSRandall Stewart /* Look at the last mbuf if any in queue */ 892*69a34e8dSRandall Stewart if ((tp->t_tail_pkt) && 893*69a34e8dSRandall Stewart (tp->t_tail_pkt->m_flags & M_ACKCMP)) { 894*69a34e8dSRandall Stewart if (M_TRAILINGSPACE(tp->t_tail_pkt) >= sizeof(struct tcp_ackent)) { 895*69a34e8dSRandall Stewart tcp_lro_log(tp, lc, le, NULL, 23, 0, 0, 0, 0); 896*69a34e8dSRandall Stewart m = tp->t_tail_pkt; 897*69a34e8dSRandall Stewart } else { 898*69a34e8dSRandall Stewart if ((inp->inp_flags2 & INP_MBUF_L_ACKS) == 0) { 899*69a34e8dSRandall Stewart counter_u64_add(tcp_would_have_but, 1); 900*69a34e8dSRandall Stewart inp->inp_flags2 |= INP_MBUF_L_ACKS; 901*69a34e8dSRandall Stewart } 902*69a34e8dSRandall Stewart } 903*69a34e8dSRandall Stewart } 904*69a34e8dSRandall Stewart } 905*69a34e8dSRandall Stewart return (m); 906*69a34e8dSRandall Stewart } 907*69a34e8dSRandall Stewart 908*69a34e8dSRandall Stewart static struct inpcb * 909*69a34e8dSRandall Stewart tcp_lro_lookup(struct lro_ctrl *lc, struct lro_entry *le) 910*69a34e8dSRandall Stewart { 911*69a34e8dSRandall Stewart struct inpcb *inp = NULL; 912*69a34e8dSRandall Stewart 913*69a34e8dSRandall Stewart NET_EPOCH_ASSERT(); 914e57b2d0eSRandall Stewart switch (le->eh_type) { 915e57b2d0eSRandall Stewart #ifdef INET6 916e57b2d0eSRandall Stewart case ETHERTYPE_IPV6: 917e57b2d0eSRandall Stewart inp = in6_pcblookup(&V_tcbinfo, &le->source_ip6, 918e57b2d0eSRandall Stewart le->source_port, &le->dest_ip6,le->dest_port, 919e57b2d0eSRandall Stewart INPLOOKUP_WLOCKPCB, 920e57b2d0eSRandall Stewart lc->ifp); 921e57b2d0eSRandall Stewart break; 922e57b2d0eSRandall Stewart #endif 923e57b2d0eSRandall Stewart #ifdef INET 924e57b2d0eSRandall Stewart case ETHERTYPE_IP: 925e57b2d0eSRandall Stewart inp = in_pcblookup(&V_tcbinfo, le->le_ip4->ip_src, 926e57b2d0eSRandall Stewart le->source_port, le->le_ip4->ip_dst, le->dest_port, 927e57b2d0eSRandall Stewart INPLOOKUP_WLOCKPCB, 928e57b2d0eSRandall Stewart lc->ifp); 929e57b2d0eSRandall Stewart break; 930e57b2d0eSRandall Stewart #endif 931e57b2d0eSRandall Stewart } 932*69a34e8dSRandall Stewart return (inp); 933*69a34e8dSRandall Stewart } 934*69a34e8dSRandall Stewart 935*69a34e8dSRandall Stewart #endif 936*69a34e8dSRandall Stewart #ifdef NO 937*69a34e8dSRandall Stewart static void 938*69a34e8dSRandall Stewart stack_guard_prep(uint32_t *sg, int len) 939*69a34e8dSRandall Stewart { 940*69a34e8dSRandall Stewart int i; 941*69a34e8dSRandall Stewart 942*69a34e8dSRandall Stewart for (i = 0; i < len; i++) { 943*69a34e8dSRandall Stewart sg[i] = 0xdeadc0de; 944*69a34e8dSRandall Stewart } 945*69a34e8dSRandall Stewart } 946*69a34e8dSRandall Stewart 947*69a34e8dSRandall Stewart static void 948*69a34e8dSRandall Stewart stack_guard_check(struct lro_ctrl *lc, struct lro_entry *le, uint32_t *sg, int len) 949*69a34e8dSRandall Stewart { 950*69a34e8dSRandall Stewart int i; 951*69a34e8dSRandall Stewart 952*69a34e8dSRandall Stewart for (i = 0; i < len; i++) { 953*69a34e8dSRandall Stewart if (sg[i] != 0xdeadc0de) 954*69a34e8dSRandall Stewart panic("Stack guard fails sg[%d] = 0x%x le:%p lc:%p sg:%p\n", 955*69a34e8dSRandall Stewart i, sg[i], le, lc, sg); 956*69a34e8dSRandall Stewart } 957*69a34e8dSRandall Stewart } 958*69a34e8dSRandall Stewart #endif 959*69a34e8dSRandall Stewart 960*69a34e8dSRandall Stewart void 961*69a34e8dSRandall Stewart tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le) 962*69a34e8dSRandall Stewart { 963*69a34e8dSRandall Stewart struct tcpcb *tp = NULL; 964*69a34e8dSRandall Stewart #ifdef TCPHPTS 965*69a34e8dSRandall Stewart struct inpcb *inp = NULL; 966*69a34e8dSRandall Stewart int need_wakeup = 0, can_queue = 0; 967*69a34e8dSRandall Stewart 968*69a34e8dSRandall Stewart /* Now lets lookup the inp first */ 969*69a34e8dSRandall Stewart CURVNET_SET(lc->ifp->if_vnet); 970*69a34e8dSRandall Stewart /* 971*69a34e8dSRandall Stewart * XXXRRS Currently the common input handler for 972*69a34e8dSRandall Stewart * mbuf queuing cannot handle VLAN Tagged. This needs 973*69a34e8dSRandall Stewart * to be fixed and the or condition removed (i.e. the 974*69a34e8dSRandall Stewart * common code should do the right lookup for the vlan 975*69a34e8dSRandall Stewart * tag and anything else that the vlan_input() does). 976*69a34e8dSRandall Stewart */ 977*69a34e8dSRandall Stewart if (le->m_head == NULL) { 978*69a34e8dSRandall Stewart /* 979*69a34e8dSRandall Stewart * Everything was pushed up to the stack nothing to do 980*69a34e8dSRandall Stewart * but release the reference and be done. 981*69a34e8dSRandall Stewart */ 982*69a34e8dSRandall Stewart if (le->inp) { 983*69a34e8dSRandall Stewart INP_WLOCK(le->inp); 984*69a34e8dSRandall Stewart if (in_pcbrele_wlocked(le->inp) == 0) { 985*69a34e8dSRandall Stewart /* 986*69a34e8dSRandall Stewart * We released it and still 987*69a34e8dSRandall Stewart * have the lock. 988*69a34e8dSRandall Stewart */ 989*69a34e8dSRandall Stewart INP_WUNLOCK(le->inp); 990*69a34e8dSRandall Stewart } 991*69a34e8dSRandall Stewart } 992*69a34e8dSRandall Stewart goto done; 993*69a34e8dSRandall Stewart } 994*69a34e8dSRandall Stewart if ((tcplro_stacks_wanting_mbufq == 0) || (le->m_head->m_flags & M_VLANTAG)) 995*69a34e8dSRandall Stewart goto skip_lookup; 996*69a34e8dSRandall Stewart 997*69a34e8dSRandall Stewart if (le->inp == NULL) { 998*69a34e8dSRandall Stewart le->inp = inp = tcp_lro_lookup(lc, le); 999e57b2d0eSRandall Stewart if (inp && ((inp->inp_flags & (INP_DROPPED|INP_TIMEWAIT)) || 1000e57b2d0eSRandall Stewart (inp->inp_flags2 & INP_FREED))) { 1001*69a34e8dSRandall Stewart /* 1002*69a34e8dSRandall Stewart * We can't present these to the inp since 1003*69a34e8dSRandall Stewart * it will not support the stripped ethernet 1004*69a34e8dSRandall Stewart * header that these have nor if a compressed 1005*69a34e8dSRandall Stewart * ack is presnet. 1006*69a34e8dSRandall Stewart */ 1007e57b2d0eSRandall Stewart INP_WUNLOCK(inp); 1008*69a34e8dSRandall Stewart lro_free_mbuf_chain(le->m_head); 1009*69a34e8dSRandall Stewart goto done; 1010e57b2d0eSRandall Stewart } 1011*69a34e8dSRandall Stewart if ((le->flags & HAS_COMP_ENTRIES) && 1012*69a34e8dSRandall Stewart ((inp->inp_flags2 & INP_MBUF_ACKCMP) == 0)) { 1013*69a34e8dSRandall Stewart /* 1014*69a34e8dSRandall Stewart * It swapped to off, must be a stack 1015*69a34e8dSRandall Stewart * switch. We need to ditch all the packets 1016*69a34e8dSRandall Stewart * and the peer will just have to retransmit. 1017*69a34e8dSRandall Stewart */ 1018*69a34e8dSRandall Stewart INP_WUNLOCK(inp); 1019*69a34e8dSRandall Stewart lro_free_mbuf_chain(le->m_head); 1020*69a34e8dSRandall Stewart goto done; 1021*69a34e8dSRandall Stewart } 1022*69a34e8dSRandall Stewart } else { 1023*69a34e8dSRandall Stewart /* We have a reference on the inp lets lock and release it */ 1024*69a34e8dSRandall Stewart inp = le->inp; 1025*69a34e8dSRandall Stewart INP_WLOCK(inp); 1026*69a34e8dSRandall Stewart if (in_pcbrele_wlocked(inp)) { 1027*69a34e8dSRandall Stewart /* 1028*69a34e8dSRandall Stewart * We lost the inp. We can't present these to the inp since 1029*69a34e8dSRandall Stewart * it will not support the stripped off etherent header. 1030*69a34e8dSRandall Stewart */ 1031*69a34e8dSRandall Stewart lro_free_mbuf_chain(le->m_head); 1032*69a34e8dSRandall Stewart goto done; 1033*69a34e8dSRandall Stewart } 1034*69a34e8dSRandall Stewart if (inp && ((inp->inp_flags & (INP_DROPPED|INP_TIMEWAIT)) || 1035*69a34e8dSRandall Stewart (inp->inp_flags2 & INP_FREED))) { 1036*69a34e8dSRandall Stewart /* 1037*69a34e8dSRandall Stewart * We can't present these to the inp since 1038*69a34e8dSRandall Stewart * it may not support them. 1039*69a34e8dSRandall Stewart */ 1040*69a34e8dSRandall Stewart INP_WUNLOCK(inp); 1041*69a34e8dSRandall Stewart lro_free_mbuf_chain(le->m_head); 1042*69a34e8dSRandall Stewart goto done; 1043*69a34e8dSRandall Stewart } 1044*69a34e8dSRandall Stewart if ((le->flags & HAS_COMP_ENTRIES) && 1045*69a34e8dSRandall Stewart ((inp->inp_flags2 & INP_MBUF_ACKCMP) == 0)) { 1046*69a34e8dSRandall Stewart /* 1047*69a34e8dSRandall Stewart * It swapped to off, must be a stack 1048*69a34e8dSRandall Stewart * switch. We need to ditch all the packets 1049*69a34e8dSRandall Stewart * and the peer will just have to retransmit. 1050*69a34e8dSRandall Stewart */ 1051*69a34e8dSRandall Stewart INP_WUNLOCK(inp); 1052*69a34e8dSRandall Stewart lro_free_mbuf_chain(le->m_head); 1053*69a34e8dSRandall Stewart goto done; 1054*69a34e8dSRandall Stewart } 1055*69a34e8dSRandall Stewart } 1056*69a34e8dSRandall Stewart if (inp && ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) || 1057*69a34e8dSRandall Stewart (inp->inp_flags2 & INP_MBUF_ACKCMP))) { 1058e57b2d0eSRandall Stewart /* The transport supports mbuf queuing */ 1059e57b2d0eSRandall Stewart can_queue = 1; 1060e57b2d0eSRandall Stewart if (le->need_wakeup || 1061e57b2d0eSRandall Stewart ((inp->inp_in_input == 0) && 1062e57b2d0eSRandall Stewart ((inp->inp_flags2 & INP_MBUF_QUEUE_READY) == 0))) { 1063e57b2d0eSRandall Stewart /* 1064e57b2d0eSRandall Stewart * Either the transport is off on a keep-alive 1065e57b2d0eSRandall Stewart * (it has the queue_ready flag clear and its 1066e57b2d0eSRandall Stewart * not already been woken) or the entry has 1067e57b2d0eSRandall Stewart * some urgent thing (FIN or possibly SACK blocks). 1068e57b2d0eSRandall Stewart * This means we need to wake the transport up by 1069e57b2d0eSRandall Stewart * putting it on the input pacer. 1070e57b2d0eSRandall Stewart */ 1071e57b2d0eSRandall Stewart need_wakeup = 1; 1072e57b2d0eSRandall Stewart if ((inp->inp_flags2 & INP_DONT_SACK_QUEUE) && 1073e57b2d0eSRandall Stewart (le->need_wakeup != 1)) { 1074e57b2d0eSRandall Stewart /* 1075e57b2d0eSRandall Stewart * Prohibited from a sack wakeup. 1076e57b2d0eSRandall Stewart */ 1077e57b2d0eSRandall Stewart need_wakeup = 0; 1078e57b2d0eSRandall Stewart } 1079e57b2d0eSRandall Stewart } 1080e57b2d0eSRandall Stewart /* Do we need to be awoken due to lots of data or acks? */ 1081e57b2d0eSRandall Stewart if ((le->tcp_tot_p_len >= lc->lro_length_lim) || 1082e57b2d0eSRandall Stewart (le->mbuf_cnt >= lc->lro_ackcnt_lim)) 1083e57b2d0eSRandall Stewart need_wakeup = 1; 1084e57b2d0eSRandall Stewart } 1085*69a34e8dSRandall Stewart if (inp) 1086e57b2d0eSRandall Stewart tp = intotcpcb(inp); 1087*69a34e8dSRandall Stewart else 1088e57b2d0eSRandall Stewart tp = NULL; 1089e57b2d0eSRandall Stewart if (can_queue) { 1090e57b2d0eSRandall Stewart counter_u64_add(tcp_inp_lro_direct_queue, 1); 1091e57b2d0eSRandall Stewart tcp_lro_log(tp, lc, le, NULL, 22, need_wakeup, 1092e57b2d0eSRandall Stewart inp->inp_flags2, inp->inp_in_input, le->need_wakeup); 1093e57b2d0eSRandall Stewart tcp_queue_pkts(tp, le); 1094e57b2d0eSRandall Stewart if (need_wakeup) { 1095e57b2d0eSRandall Stewart /* 1096e57b2d0eSRandall Stewart * We must get the guy to wakeup via 1097e57b2d0eSRandall Stewart * hpts. 1098e57b2d0eSRandall Stewart */ 1099*69a34e8dSRandall Stewart NET_EPOCH_ASSERT(); 1100*69a34e8dSRandall Stewart if (le->need_wakeup == 2) { 1101*69a34e8dSRandall Stewart /* 1102*69a34e8dSRandall Stewart * The value 2 is set if the 1103*69a34e8dSRandall Stewart * options are unrecognized i.e. 1104*69a34e8dSRandall Stewart * not just a timestamp. So really 1105*69a34e8dSRandall Stewart * sack is usually what it is but 1106*69a34e8dSRandall Stewart * it might be some other option (CWR 1107*69a34e8dSRandall Stewart * etc). 1108*69a34e8dSRandall Stewart */ 1109e57b2d0eSRandall Stewart counter_u64_add(tcp_inp_lro_sack_wake, 1); 1110*69a34e8dSRandall Stewart } 1111*69a34e8dSRandall Stewart counter_u64_add(tcp_inp_lro_wokeup_queue, 1); 1112*69a34e8dSRandall Stewart if ((*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0)) { 1113*69a34e8dSRandall Stewart inp = NULL; 1114e57b2d0eSRandall Stewart } 1115e57b2d0eSRandall Stewart } 1116*69a34e8dSRandall Stewart } 1117*69a34e8dSRandall Stewart if (inp) { 1118e57b2d0eSRandall Stewart /* Unlock it */ 1119e57b2d0eSRandall Stewart tp = NULL; 1120e57b2d0eSRandall Stewart counter_u64_add(tcp_inp_lro_locks_taken, 1); 1121e57b2d0eSRandall Stewart INP_WUNLOCK(inp); 1122e57b2d0eSRandall Stewart } 1123e57b2d0eSRandall Stewart if (can_queue == 0) { 1124e57b2d0eSRandall Stewart skip_lookup: 1125*69a34e8dSRandall Stewart if (le->strip_cnt) { 1126*69a34e8dSRandall Stewart /* 1127*69a34e8dSRandall Stewart * We have stripped mbufs, the connection 1128*69a34e8dSRandall Stewart * must have changed underneath us. You 1129*69a34e8dSRandall Stewart * loose the packets as a penalty. 1130*69a34e8dSRandall Stewart */ 1131*69a34e8dSRandall Stewart lro_free_mbuf_chain(le->m_head); 1132*69a34e8dSRandall Stewart goto done; 1133*69a34e8dSRandall Stewart } 1134373013b0SConrad Meyer #endif /* TCPHPTS */ 1135e57b2d0eSRandall Stewart /* Old fashioned lro method */ 1136e57b2d0eSRandall Stewart if (le->m_head != le->m_last_mbuf) { 1137e57b2d0eSRandall Stewart counter_u64_add(tcp_inp_lro_compressed, 1); 1138*69a34e8dSRandall Stewart tcp_lro_condense(tp, lc, le); 1139e57b2d0eSRandall Stewart } else 1140e57b2d0eSRandall Stewart counter_u64_add(tcp_inp_lro_single_push, 1); 1141*69a34e8dSRandall Stewart tcp_flush_out_le(tp, lc, le); 1142e57b2d0eSRandall Stewart #ifdef TCPHPTS 1143e57b2d0eSRandall Stewart } 1144*69a34e8dSRandall Stewart done: 1145e57b2d0eSRandall Stewart CURVNET_RESTORE(); 1146e57b2d0eSRandall Stewart #endif 114762b5b6ecSBjoern A. Zeeb lc->lro_flushed++; 114862b5b6ecSBjoern A. Zeeb bzero(le, sizeof(*le)); 11491ea44822SSepherosa Ziehau LIST_INSERT_HEAD(&lc->lro_free, le, next); 115062b5b6ecSBjoern A. Zeeb } 11516c5087a8SJack F Vogel 1152fc271df3SHans Petter Selasky #ifdef HAVE_INLINE_FLSLL 1153fc271df3SHans Petter Selasky #define tcp_lro_msb_64(x) (1ULL << (flsll(x) - 1)) 1154fc271df3SHans Petter Selasky #else 1155fc271df3SHans Petter Selasky static inline uint64_t 1156fc271df3SHans Petter Selasky tcp_lro_msb_64(uint64_t x) 1157e936121dSHans Petter Selasky { 1158fc271df3SHans Petter Selasky x |= (x >> 1); 1159fc271df3SHans Petter Selasky x |= (x >> 2); 1160fc271df3SHans Petter Selasky x |= (x >> 4); 1161fc271df3SHans Petter Selasky x |= (x >> 8); 1162fc271df3SHans Petter Selasky x |= (x >> 16); 1163fc271df3SHans Petter Selasky x |= (x >> 32); 1164fc271df3SHans Petter Selasky return (x & ~(x >> 1)); 1165fc271df3SHans Petter Selasky } 1166fc271df3SHans Petter Selasky #endif 1167e936121dSHans Petter Selasky 1168fc271df3SHans Petter Selasky /* 1169fc271df3SHans Petter Selasky * The tcp_lro_sort() routine is comparable to qsort(), except it has 1170fc271df3SHans Petter Selasky * a worst case complexity limit of O(MIN(N,64)*N), where N is the 1171fc271df3SHans Petter Selasky * number of elements to sort and 64 is the number of sequence bits 1172fc271df3SHans Petter Selasky * available. The algorithm is bit-slicing the 64-bit sequence number, 1173fc271df3SHans Petter Selasky * sorting one bit at a time from the most significant bit until the 1174ec668905SHans Petter Selasky * least significant one, skipping the constant bits. This is 1175ec668905SHans Petter Selasky * typically called a radix sort. 1176fc271df3SHans Petter Selasky */ 1177fc271df3SHans Petter Selasky static void 1178fc271df3SHans Petter Selasky tcp_lro_sort(struct lro_mbuf_sort *parray, uint32_t size) 1179fc271df3SHans Petter Selasky { 1180fc271df3SHans Petter Selasky struct lro_mbuf_sort temp; 1181fc271df3SHans Petter Selasky uint64_t ones; 1182fc271df3SHans Petter Selasky uint64_t zeros; 1183fc271df3SHans Petter Selasky uint32_t x; 1184fc271df3SHans Petter Selasky uint32_t y; 1185e936121dSHans Petter Selasky 1186fc271df3SHans Petter Selasky repeat: 1187ec668905SHans Petter Selasky /* for small arrays insertion sort is faster */ 1188fc271df3SHans Petter Selasky if (size <= 12) { 1189ec668905SHans Petter Selasky for (x = 1; x < size; x++) { 1190fc271df3SHans Petter Selasky temp = parray[x]; 1191ec668905SHans Petter Selasky for (y = x; y > 0 && temp.seq < parray[y - 1].seq; y--) 1192ec668905SHans Petter Selasky parray[y] = parray[y - 1]; 1193fc271df3SHans Petter Selasky parray[y] = temp; 1194fc271df3SHans Petter Selasky } 1195fc271df3SHans Petter Selasky return; 1196fc271df3SHans Petter Selasky } 1197e936121dSHans Petter Selasky 1198fc271df3SHans Petter Selasky /* compute sequence bits which are constant */ 1199fc271df3SHans Petter Selasky ones = 0; 1200fc271df3SHans Petter Selasky zeros = 0; 1201fc271df3SHans Petter Selasky for (x = 0; x != size; x++) { 1202fc271df3SHans Petter Selasky ones |= parray[x].seq; 1203fc271df3SHans Petter Selasky zeros |= ~parray[x].seq; 1204fc271df3SHans Petter Selasky } 1205fc271df3SHans Petter Selasky 1206fc271df3SHans Petter Selasky /* compute bits which are not constant into "ones" */ 1207fc271df3SHans Petter Selasky ones &= zeros; 1208fc271df3SHans Petter Selasky if (ones == 0) 1209fc271df3SHans Petter Selasky return; 1210fc271df3SHans Petter Selasky 1211fc271df3SHans Petter Selasky /* pick the most significant bit which is not constant */ 1212fc271df3SHans Petter Selasky ones = tcp_lro_msb_64(ones); 1213fc271df3SHans Petter Selasky 1214fc271df3SHans Petter Selasky /* 1215fc271df3SHans Petter Selasky * Move entries having cleared sequence bits to the beginning 1216fc271df3SHans Petter Selasky * of the array: 1217fc271df3SHans Petter Selasky */ 1218fc271df3SHans Petter Selasky for (x = y = 0; y != size; y++) { 1219fc271df3SHans Petter Selasky /* skip set bits */ 1220fc271df3SHans Petter Selasky if (parray[y].seq & ones) 1221fc271df3SHans Petter Selasky continue; 1222fc271df3SHans Petter Selasky /* swap entries */ 1223fc271df3SHans Petter Selasky temp = parray[x]; 1224fc271df3SHans Petter Selasky parray[x] = parray[y]; 1225fc271df3SHans Petter Selasky parray[y] = temp; 1226fc271df3SHans Petter Selasky x++; 1227fc271df3SHans Petter Selasky } 1228fc271df3SHans Petter Selasky 1229fc271df3SHans Petter Selasky KASSERT(x != 0 && x != size, ("Memory is corrupted\n")); 1230fc271df3SHans Petter Selasky 1231fc271df3SHans Petter Selasky /* sort zeros */ 1232fc271df3SHans Petter Selasky tcp_lro_sort(parray, x); 1233fc271df3SHans Petter Selasky 1234fc271df3SHans Petter Selasky /* sort ones */ 1235fc271df3SHans Petter Selasky parray += x; 1236fc271df3SHans Petter Selasky size -= x; 1237fc271df3SHans Petter Selasky goto repeat; 1238e936121dSHans Petter Selasky } 1239e936121dSHans Petter Selasky 1240e936121dSHans Petter Selasky void 1241e936121dSHans Petter Selasky tcp_lro_flush_all(struct lro_ctrl *lc) 1242e936121dSHans Petter Selasky { 1243fc271df3SHans Petter Selasky uint64_t seq; 1244fc271df3SHans Petter Selasky uint64_t nseq; 1245e936121dSHans Petter Selasky unsigned x; 1246e936121dSHans Petter Selasky 1247e936121dSHans Petter Selasky /* check if no mbufs to flush */ 12486dd38b87SSepherosa Ziehau if (lc->lro_mbuf_count == 0) 1249e936121dSHans Petter Selasky goto done; 1250e936121dSHans Petter Selasky 1251*69a34e8dSRandall Stewart microuptime(&lc->lro_last_flush); 1252e936121dSHans Petter Selasky /* sort all mbufs according to stream */ 1253fc271df3SHans Petter Selasky tcp_lro_sort(lc->lro_mbuf_data, lc->lro_mbuf_count); 1254e936121dSHans Petter Selasky 1255e936121dSHans Petter Selasky /* input data into LRO engine, stream by stream */ 1256fc271df3SHans Petter Selasky seq = 0; 1257e936121dSHans Petter Selasky for (x = 0; x != lc->lro_mbuf_count; x++) { 1258e936121dSHans Petter Selasky struct mbuf *mb; 1259e936121dSHans Petter Selasky 1260fc271df3SHans Petter Selasky /* get mbuf */ 1261fc271df3SHans Petter Selasky mb = lc->lro_mbuf_data[x].mb; 1262fc271df3SHans Petter Selasky 1263fc271df3SHans Petter Selasky /* get sequence number, masking away the packet index */ 1264fc271df3SHans Petter Selasky nseq = lc->lro_mbuf_data[x].seq & (-1ULL << 24); 1265e936121dSHans Petter Selasky 1266e936121dSHans Petter Selasky /* check for new stream */ 1267fc271df3SHans Petter Selasky if (seq != nseq) { 1268fc271df3SHans Petter Selasky seq = nseq; 1269e936121dSHans Petter Selasky 1270e936121dSHans Petter Selasky /* flush active streams */ 12716dd38b87SSepherosa Ziehau tcp_lro_rx_done(lc); 1272e936121dSHans Petter Selasky } 1273fc271df3SHans Petter Selasky 1274e936121dSHans Petter Selasky /* add packet to LRO engine */ 127505cde7efSSepherosa Ziehau if (tcp_lro_rx2(lc, mb, 0, 0) != 0) { 1276e936121dSHans Petter Selasky /* input packet to network layer */ 1277e936121dSHans Petter Selasky (*lc->ifp->if_input)(lc->ifp, mb); 1278e936121dSHans Petter Selasky lc->lro_queued++; 1279e936121dSHans Petter Selasky lc->lro_flushed++; 1280e936121dSHans Petter Selasky } 1281e936121dSHans Petter Selasky } 1282e936121dSHans Petter Selasky done: 1283e936121dSHans Petter Selasky /* flush active streams */ 12846dd38b87SSepherosa Ziehau tcp_lro_rx_done(lc); 12856dd38b87SSepherosa Ziehau 1286e936121dSHans Petter Selasky lc->lro_mbuf_count = 0; 1287e936121dSHans Petter Selasky } 1288e936121dSHans Petter Selasky 1289e57b2d0eSRandall Stewart static void 1290e57b2d0eSRandall Stewart lro_set_mtime(struct timeval *tv, struct timespec *ts) 129162b5b6ecSBjoern A. Zeeb { 1292e57b2d0eSRandall Stewart tv->tv_sec = ts->tv_sec; 1293e57b2d0eSRandall Stewart tv->tv_usec = ts->tv_nsec / 1000; 129462b5b6ecSBjoern A. Zeeb } 12956c5087a8SJack F Vogel 1296*69a34e8dSRandall Stewart static void 1297*69a34e8dSRandall Stewart build_ack_entry(struct tcp_ackent *ae, struct tcphdr *th, struct mbuf *m, uint16_t hdr_len, uint16_t iptos) 1298*69a34e8dSRandall Stewart { 1299*69a34e8dSRandall Stewart /* 1300*69a34e8dSRandall Stewart * Given a TCP ack, summarize it down into the small tcp 1301*69a34e8dSRandall Stewart * ack entry. 1302*69a34e8dSRandall Stewart */ 1303*69a34e8dSRandall Stewart u_char *cp; 1304*69a34e8dSRandall Stewart 1305*69a34e8dSRandall Stewart 1306*69a34e8dSRandall Stewart KASSERT(((th->th_flags & ~(TH_ACK | TH_PUSH | TH_CWR | TH_ECE)) == 0), 1307*69a34e8dSRandall Stewart ("tcphdr:%p mbuf:%p has unallowed bits %x", th, m, th->th_flags)); 1308*69a34e8dSRandall Stewart ae->timestamp = m->m_pkthdr.rcv_tstmp; 1309*69a34e8dSRandall Stewart if (m->m_flags & M_TSTMP_LRO) 1310*69a34e8dSRandall Stewart ae->flags = TSTMP_LRO; 1311*69a34e8dSRandall Stewart else if (m->m_flags & M_TSTMP) 1312*69a34e8dSRandall Stewart ae->flags = TSTMP_HDWR; 1313*69a34e8dSRandall Stewart ae->seq = ntohl(th->th_seq); 1314*69a34e8dSRandall Stewart ae->ack = ntohl(th->th_ack); 1315*69a34e8dSRandall Stewart ae->flags |= th->th_flags; 1316*69a34e8dSRandall Stewart if (hdr_len) { 1317*69a34e8dSRandall Stewart /* We have a timestamp options get out the bits */ 1318*69a34e8dSRandall Stewart cp = (u_char *)(th + 1); 1319*69a34e8dSRandall Stewart /* Skip the two NOP's at the front */ 1320*69a34e8dSRandall Stewart while (*cp == TCPOPT_NOP) 1321*69a34e8dSRandall Stewart cp++; 1322*69a34e8dSRandall Stewart KASSERT(((*cp == TCPOPT_TIMESTAMP) && 1323*69a34e8dSRandall Stewart (cp[1] == TCPOLEN_TIMESTAMP)), 1324*69a34e8dSRandall Stewart ("At %p in tcphdr:%p options of %d not timestamp", 1325*69a34e8dSRandall Stewart cp, th, hdr_len)); 1326*69a34e8dSRandall Stewart bcopy((char *)cp + 2, 1327*69a34e8dSRandall Stewart (char *)&ae->ts_value, sizeof(uint32_t)); 1328*69a34e8dSRandall Stewart ae->ts_value = ntohl(ae->ts_value); 1329*69a34e8dSRandall Stewart bcopy((char *)cp + 6, 1330*69a34e8dSRandall Stewart (char *)&ae->ts_echo, sizeof(uint32_t)); 1331*69a34e8dSRandall Stewart ae->ts_echo = ntohl(ae->ts_echo); 1332*69a34e8dSRandall Stewart ae->flags |= HAS_TSTMP; 1333*69a34e8dSRandall Stewart } 1334*69a34e8dSRandall Stewart ae->win = ntohs(th->th_win); 1335*69a34e8dSRandall Stewart ae->codepoint = iptos; 1336*69a34e8dSRandall Stewart } 1337*69a34e8dSRandall Stewart 1338*69a34e8dSRandall Stewart static struct mbuf * 1339*69a34e8dSRandall Stewart do_bpf_and_csum(struct inpcb *inp, struct lro_ctrl *lc, struct lro_entry *le, 1340*69a34e8dSRandall Stewart struct ether_header *eh, struct mbuf *m, int bpf_req, int locked) 1341*69a34e8dSRandall Stewart { 1342*69a34e8dSRandall Stewart /* 1343*69a34e8dSRandall Stewart * Do TCP/IP checksum and BPF tap for either ACK_CMP packets or 1344*69a34e8dSRandall Stewart * MBUF QUEUE type packets. 1345*69a34e8dSRandall Stewart */ 1346*69a34e8dSRandall Stewart struct tcphdr *th; 1347*69a34e8dSRandall Stewart #ifdef INET6 1348*69a34e8dSRandall Stewart struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */ 1349*69a34e8dSRandall Stewart #endif 1350*69a34e8dSRandall Stewart #ifdef INET 1351*69a34e8dSRandall Stewart struct ip *ip = NULL; /* Keep compiler happy. */ 1352*69a34e8dSRandall Stewart #endif 1353*69a34e8dSRandall Stewart 1354*69a34e8dSRandall Stewart uint16_t drop_hdrlen; 1355*69a34e8dSRandall Stewart int etype, tlen; 1356*69a34e8dSRandall Stewart uint8_t iptos; 1357*69a34e8dSRandall Stewart 1358*69a34e8dSRandall Stewart /* Let the BPF see the packet */ 1359*69a34e8dSRandall Stewart if (bpf_req && lc->ifp) 1360*69a34e8dSRandall Stewart ETHER_BPF_MTAP(lc->ifp, m); 1361*69a34e8dSRandall Stewart /* Get type and Trim off the ethernet header */ 1362*69a34e8dSRandall Stewart m->m_pkthdr.lro_etype = etype = ntohs(eh->ether_type); 1363*69a34e8dSRandall Stewart m_adj(m, sizeof(*eh)); 1364*69a34e8dSRandall Stewart m->m_flags |= M_LRO_EHDRSTRP; 1365*69a34e8dSRandall Stewart switch (etype) { 1366*69a34e8dSRandall Stewart #ifdef INET6 1367*69a34e8dSRandall Stewart case ETHERTYPE_IPV6: 1368*69a34e8dSRandall Stewart { 1369*69a34e8dSRandall Stewart if (m->m_len < (sizeof(*ip6) + sizeof(*th))) { 1370*69a34e8dSRandall Stewart m = m_pullup(m, sizeof(*ip6) + sizeof(*th)); 1371*69a34e8dSRandall Stewart if (m == NULL) { 1372*69a34e8dSRandall Stewart TCPSTAT_INC(tcps_rcvshort); 1373*69a34e8dSRandall Stewart m_freem(m); 1374*69a34e8dSRandall Stewart return (NULL); 1375*69a34e8dSRandall Stewart } 1376*69a34e8dSRandall Stewart } 1377*69a34e8dSRandall Stewart ip6 = (struct ip6_hdr *)(eh + 1); 1378*69a34e8dSRandall Stewart th = (struct tcphdr *)(ip6 + 1); 1379*69a34e8dSRandall Stewart tlen = ntohs(ip6->ip6_plen); 1380*69a34e8dSRandall Stewart drop_hdrlen = sizeof(*ip6); 1381*69a34e8dSRandall Stewart if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) { 1382*69a34e8dSRandall Stewart if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) { 1383*69a34e8dSRandall Stewart counter_u64_add(tcp_csum_hardware_w_ph, 1); 1384*69a34e8dSRandall Stewart th->th_sum = m->m_pkthdr.csum_data; 1385*69a34e8dSRandall Stewart } else { 1386*69a34e8dSRandall Stewart counter_u64_add(tcp_csum_hardware, 1); 1387*69a34e8dSRandall Stewart th->th_sum = in6_cksum_pseudo(ip6, tlen, 1388*69a34e8dSRandall Stewart IPPROTO_TCP, m->m_pkthdr.csum_data); 1389*69a34e8dSRandall Stewart } 1390*69a34e8dSRandall Stewart th->th_sum ^= 0xffff; 1391*69a34e8dSRandall Stewart } else { 1392*69a34e8dSRandall Stewart counter_u64_add(tcp_csum_software, 1); 1393*69a34e8dSRandall Stewart th->th_sum = in6_cksum(m, IPPROTO_TCP, drop_hdrlen, tlen); 1394*69a34e8dSRandall Stewart } 1395*69a34e8dSRandall Stewart if (th->th_sum) { 1396*69a34e8dSRandall Stewart TCPSTAT_INC(tcps_rcvbadsum); 1397*69a34e8dSRandall Stewart if (locked) { 1398*69a34e8dSRandall Stewart /* Log the bad news */ 1399*69a34e8dSRandall Stewart struct tcpcb *tp = intotcpcb(inp); 1400*69a34e8dSRandall Stewart 1401*69a34e8dSRandall Stewart tcp_lro_log(tp, lc, le, m, 13, tlen, m->m_pkthdr.csum_flags, drop_hdrlen, th->th_sum); 1402*69a34e8dSRandall Stewart } 1403*69a34e8dSRandall Stewart m_freem(m); 1404*69a34e8dSRandall Stewart return (NULL); 1405*69a34e8dSRandall Stewart } 1406*69a34e8dSRandall Stewart 1407*69a34e8dSRandall Stewart 1408*69a34e8dSRandall Stewart 1409*69a34e8dSRandall Stewart 1410*69a34e8dSRandall Stewart /* 1411*69a34e8dSRandall Stewart * Be proactive about unspecified IPv6 address in source. 1412*69a34e8dSRandall Stewart * As we use all-zero to indicate unbounded/unconnected pcb, 1413*69a34e8dSRandall Stewart * unspecified IPv6 address can be used to confuse us. 1414*69a34e8dSRandall Stewart * 1415*69a34e8dSRandall Stewart * Note that packets with unspecified IPv6 destination is 1416*69a34e8dSRandall Stewart * already dropped in ip6_input. 1417*69a34e8dSRandall Stewart */ 1418*69a34e8dSRandall Stewart if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { 1419*69a34e8dSRandall Stewart /* XXX stat */ 1420*69a34e8dSRandall Stewart m_freem(m); 1421*69a34e8dSRandall Stewart return (NULL); 1422*69a34e8dSRandall Stewart } 1423*69a34e8dSRandall Stewart break; 1424*69a34e8dSRandall Stewart } 1425*69a34e8dSRandall Stewart #endif 1426*69a34e8dSRandall Stewart #ifdef INET 1427*69a34e8dSRandall Stewart case ETHERTYPE_IP: 1428*69a34e8dSRandall Stewart { 1429*69a34e8dSRandall Stewart if (m->m_len < sizeof (struct tcpiphdr)) { 1430*69a34e8dSRandall Stewart if ((m = m_pullup(m, sizeof (struct tcpiphdr))) 1431*69a34e8dSRandall Stewart == NULL) { 1432*69a34e8dSRandall Stewart TCPSTAT_INC(tcps_rcvshort); 1433*69a34e8dSRandall Stewart m_freem(m); 1434*69a34e8dSRandall Stewart return (NULL); 1435*69a34e8dSRandall Stewart } 1436*69a34e8dSRandall Stewart } 1437*69a34e8dSRandall Stewart ip = (struct ip *)(eh + 1); 1438*69a34e8dSRandall Stewart th = (struct tcphdr *)(ip + 1); 1439*69a34e8dSRandall Stewart iptos = ip->ip_tos; 1440*69a34e8dSRandall Stewart drop_hdrlen = sizeof(*ip); 1441*69a34e8dSRandall Stewart tlen = ntohs(ip->ip_len) - sizeof(struct ip); 1442*69a34e8dSRandall Stewart if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { 1443*69a34e8dSRandall Stewart if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) { 1444*69a34e8dSRandall Stewart counter_u64_add(tcp_csum_hardware_w_ph, 1); 1445*69a34e8dSRandall Stewart th->th_sum = m->m_pkthdr.csum_data; 1446*69a34e8dSRandall Stewart } else { 1447*69a34e8dSRandall Stewart counter_u64_add(tcp_csum_hardware, 1); 1448*69a34e8dSRandall Stewart th->th_sum = in_pseudo(ip->ip_src.s_addr, 1449*69a34e8dSRandall Stewart ip->ip_dst.s_addr, 1450*69a34e8dSRandall Stewart htonl(m->m_pkthdr.csum_data + tlen + 1451*69a34e8dSRandall Stewart IPPROTO_TCP)); 1452*69a34e8dSRandall Stewart } 1453*69a34e8dSRandall Stewart th->th_sum ^= 0xffff; 1454*69a34e8dSRandall Stewart } else { 1455*69a34e8dSRandall Stewart int len; 1456*69a34e8dSRandall Stewart struct ipovly *ipov = (struct ipovly *)ip; 1457*69a34e8dSRandall Stewart /* 1458*69a34e8dSRandall Stewart * Checksum extended TCP header and data. 1459*69a34e8dSRandall Stewart */ 1460*69a34e8dSRandall Stewart counter_u64_add(tcp_csum_software, 1); 1461*69a34e8dSRandall Stewart len = drop_hdrlen + tlen; 1462*69a34e8dSRandall Stewart bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); 1463*69a34e8dSRandall Stewart ipov->ih_len = htons(tlen); 1464*69a34e8dSRandall Stewart th->th_sum = in_cksum(m, len); 1465*69a34e8dSRandall Stewart /* Reset length for SDT probes. */ 1466*69a34e8dSRandall Stewart ip->ip_len = htons(len); 1467*69a34e8dSRandall Stewart /* Reset TOS bits */ 1468*69a34e8dSRandall Stewart ip->ip_tos = iptos; 1469*69a34e8dSRandall Stewart /* Re-initialization for later version check */ 1470*69a34e8dSRandall Stewart ip->ip_v = IPVERSION; 1471*69a34e8dSRandall Stewart ip->ip_hl = sizeof(*ip) >> 2; 1472*69a34e8dSRandall Stewart } 1473*69a34e8dSRandall Stewart if (th->th_sum) { 1474*69a34e8dSRandall Stewart TCPSTAT_INC(tcps_rcvbadsum); 1475*69a34e8dSRandall Stewart if (locked) { 1476*69a34e8dSRandall Stewart /* Log the bad news */ 1477*69a34e8dSRandall Stewart struct tcpcb *tp = intotcpcb(inp); 1478*69a34e8dSRandall Stewart 1479*69a34e8dSRandall Stewart tcp_lro_log(tp, lc, le, m, 13, tlen, m->m_pkthdr.csum_flags, drop_hdrlen, th->th_sum); 1480*69a34e8dSRandall Stewart } 1481*69a34e8dSRandall Stewart m_freem(m); 1482*69a34e8dSRandall Stewart return (NULL); 1483*69a34e8dSRandall Stewart } 1484*69a34e8dSRandall Stewart break; 1485*69a34e8dSRandall Stewart } 1486*69a34e8dSRandall Stewart #endif 1487*69a34e8dSRandall Stewart } /* end switch */ 1488*69a34e8dSRandall Stewart return (m); 1489*69a34e8dSRandall Stewart } 1490*69a34e8dSRandall Stewart 149105cde7efSSepherosa Ziehau static int 149205cde7efSSepherosa Ziehau tcp_lro_rx2(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, int use_hash) 149362b5b6ecSBjoern A. Zeeb { 149462b5b6ecSBjoern A. Zeeb struct lro_entry *le; 149562b5b6ecSBjoern A. Zeeb struct ether_header *eh; 149662b5b6ecSBjoern A. Zeeb #ifdef INET6 149762b5b6ecSBjoern A. Zeeb struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */ 149862b5b6ecSBjoern A. Zeeb #endif 149962b5b6ecSBjoern A. Zeeb #ifdef INET 150062b5b6ecSBjoern A. Zeeb struct ip *ip4 = NULL; /* Keep compiler happy. */ 150162b5b6ecSBjoern A. Zeeb #endif 150262b5b6ecSBjoern A. Zeeb struct tcphdr *th; 150362b5b6ecSBjoern A. Zeeb void *l3hdr = NULL; /* Keep compiler happy. */ 150462b5b6ecSBjoern A. Zeeb uint32_t *ts_ptr; 150562b5b6ecSBjoern A. Zeeb tcp_seq seq; 1506*69a34e8dSRandall Stewart int error, ip_len, hdr_len, locked = 0; 1507e57b2d0eSRandall Stewart uint16_t eh_type, tcp_data_len, need_flush; 1508*69a34e8dSRandall Stewart #ifdef TCPHPTS 1509*69a34e8dSRandall Stewart uint16_t iptos; 1510*69a34e8dSRandall Stewart #endif 151105cde7efSSepherosa Ziehau struct lro_head *bucket; 1512e57b2d0eSRandall Stewart struct timespec arrv; 15136c5087a8SJack F Vogel 1514*69a34e8dSRandall Stewart /* Clear the flags we may use to communicate with TCP */ 1515*69a34e8dSRandall Stewart m->m_flags &= ~(M_ACKCMP|M_LRO_EHDRSTRP); 1516*69a34e8dSRandall Stewart 151762b5b6ecSBjoern A. Zeeb /* We expect a contiguous header [eh, ip, tcp]. */ 1518e57b2d0eSRandall Stewart if ((m->m_flags & (M_TSTMP_LRO|M_TSTMP)) == 0) { 1519e57b2d0eSRandall Stewart /* If no hardware or arrival stamp on the packet add arrival */ 1520e57b2d0eSRandall Stewart nanouptime(&arrv); 1521e57b2d0eSRandall Stewart m->m_pkthdr.rcv_tstmp = (arrv.tv_sec * 1000000000) + arrv.tv_nsec; 1522e57b2d0eSRandall Stewart m->m_flags |= M_TSTMP_LRO; 1523e57b2d0eSRandall Stewart } 152462b5b6ecSBjoern A. Zeeb eh = mtod(m, struct ether_header *); 152562b5b6ecSBjoern A. Zeeb eh_type = ntohs(eh->ether_type); 152662b5b6ecSBjoern A. Zeeb switch (eh_type) { 152762b5b6ecSBjoern A. Zeeb #ifdef INET6 152862b5b6ecSBjoern A. Zeeb case ETHERTYPE_IPV6: 15295fa2656eSBjoern A. Zeeb { 15305fa2656eSBjoern A. Zeeb CURVNET_SET(lc->ifp->if_vnet); 153131bfc56eSBjoern A. Zeeb if (V_ip6_forwarding != 0) { 153231bfc56eSBjoern A. Zeeb /* XXX-BZ stats but changing lro_ctrl is a problem. */ 15335fa2656eSBjoern A. Zeeb CURVNET_RESTORE(); 153431bfc56eSBjoern A. Zeeb return (TCP_LRO_CANNOT); 153531bfc56eSBjoern A. Zeeb } 15365fa2656eSBjoern A. Zeeb CURVNET_RESTORE(); 153762b5b6ecSBjoern A. Zeeb l3hdr = ip6 = (struct ip6_hdr *)(eh + 1); 153862b5b6ecSBjoern A. Zeeb error = tcp_lro_rx_ipv6(lc, m, ip6, &th); 153962b5b6ecSBjoern A. Zeeb if (error != 0) 154062b5b6ecSBjoern A. Zeeb return (error); 154162b5b6ecSBjoern A. Zeeb tcp_data_len = ntohs(ip6->ip6_plen); 1542*69a34e8dSRandall Stewart #ifdef TCPHPTS 1543*69a34e8dSRandall Stewart iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; 1544*69a34e8dSRandall Stewart #endif 154562b5b6ecSBjoern A. Zeeb ip_len = sizeof(*ip6) + tcp_data_len; 154662b5b6ecSBjoern A. Zeeb break; 15475fa2656eSBjoern A. Zeeb } 154862b5b6ecSBjoern A. Zeeb #endif 154962b5b6ecSBjoern A. Zeeb #ifdef INET 155062b5b6ecSBjoern A. Zeeb case ETHERTYPE_IP: 15515fa2656eSBjoern A. Zeeb { 15525fa2656eSBjoern A. Zeeb CURVNET_SET(lc->ifp->if_vnet); 155331bfc56eSBjoern A. Zeeb if (V_ipforwarding != 0) { 155431bfc56eSBjoern A. Zeeb /* XXX-BZ stats but changing lro_ctrl is a problem. */ 15555fa2656eSBjoern A. Zeeb CURVNET_RESTORE(); 155631bfc56eSBjoern A. Zeeb return (TCP_LRO_CANNOT); 155731bfc56eSBjoern A. Zeeb } 15585fa2656eSBjoern A. Zeeb CURVNET_RESTORE(); 155962b5b6ecSBjoern A. Zeeb l3hdr = ip4 = (struct ip *)(eh + 1); 156062b5b6ecSBjoern A. Zeeb error = tcp_lro_rx_ipv4(lc, m, ip4, &th); 156162b5b6ecSBjoern A. Zeeb if (error != 0) 156262b5b6ecSBjoern A. Zeeb return (error); 156362b5b6ecSBjoern A. Zeeb ip_len = ntohs(ip4->ip_len); 1564*69a34e8dSRandall Stewart #ifdef TCPHPTS 1565*69a34e8dSRandall Stewart iptos = ip4->ip_tos; 1566*69a34e8dSRandall Stewart #endif 156762b5b6ecSBjoern A. Zeeb tcp_data_len = ip_len - sizeof(*ip4); 156862b5b6ecSBjoern A. Zeeb break; 15695fa2656eSBjoern A. Zeeb } 157062b5b6ecSBjoern A. Zeeb #endif 157162b5b6ecSBjoern A. Zeeb /* XXX-BZ what happens in case of VLAN(s)? */ 157262b5b6ecSBjoern A. Zeeb default: 157362b5b6ecSBjoern A. Zeeb return (TCP_LRO_NOT_SUPPORTED); 157462b5b6ecSBjoern A. Zeeb } 15756c5087a8SJack F Vogel 15766c5087a8SJack F Vogel /* 157762b5b6ecSBjoern A. Zeeb * If the frame is padded beyond the end of the IP packet, then we must 157862b5b6ecSBjoern A. Zeeb * trim the extra bytes off. 15796c5087a8SJack F Vogel */ 1580*69a34e8dSRandall Stewart hdr_len = m->m_pkthdr.len - (ETHER_HDR_LEN + ip_len); 1581*69a34e8dSRandall Stewart if (hdr_len != 0) { 1582*69a34e8dSRandall Stewart if (hdr_len < 0) 158362b5b6ecSBjoern A. Zeeb /* Truncated packet. */ 158462b5b6ecSBjoern A. Zeeb return (TCP_LRO_CANNOT); 158562b5b6ecSBjoern A. Zeeb 1586*69a34e8dSRandall Stewart m_adj(m, -hdr_len); 15876c5087a8SJack F Vogel } 158862b5b6ecSBjoern A. Zeeb /* 158962b5b6ecSBjoern A. Zeeb * Check TCP header constraints. 159062b5b6ecSBjoern A. Zeeb */ 1591*69a34e8dSRandall Stewart hdr_len = (th->th_off << 2); 1592*69a34e8dSRandall Stewart ts_ptr = (uint32_t *)(th + 1); 1593*69a34e8dSRandall Stewart tcp_data_len -= hdr_len; 1594*69a34e8dSRandall Stewart hdr_len -= sizeof(*th); 1595b9ec6f0bSSepherosa Ziehau if (th->th_flags & TH_SYN) 159662b5b6ecSBjoern A. Zeeb return (TCP_LRO_CANNOT); 1597*69a34e8dSRandall Stewart if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) { 1598e57b2d0eSRandall Stewart need_flush = 1; 1599*69a34e8dSRandall Stewart } else 1600e57b2d0eSRandall Stewart need_flush = 0; 1601*69a34e8dSRandall Stewart if (hdr_len != 0 && (__predict_false(hdr_len != TCPOLEN_TSTAMP_APPA) || 160262b5b6ecSBjoern A. Zeeb (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16| 1603b9ec6f0bSSepherosa Ziehau TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) { 1604b9ec6f0bSSepherosa Ziehau /* 1605e57b2d0eSRandall Stewart * We have an option besides Timestamps, maybe 1606e57b2d0eSRandall Stewart * it is a sack (most likely) which means we 1607e57b2d0eSRandall Stewart * will probably need to wake up a sleeper (if 1608e57b2d0eSRandall Stewart * the guy does queueing). 1609b9ec6f0bSSepherosa Ziehau */ 1610e57b2d0eSRandall Stewart need_flush = 2; 1611b9ec6f0bSSepherosa Ziehau } 161262b5b6ecSBjoern A. Zeeb /* If the driver did not pass in the checksum, set it now. */ 161362b5b6ecSBjoern A. Zeeb if (csum == 0x0000) 161462b5b6ecSBjoern A. Zeeb csum = th->th_sum; 161562b5b6ecSBjoern A. Zeeb seq = ntohl(th->th_seq); 161605cde7efSSepherosa Ziehau if (!use_hash) { 161705cde7efSSepherosa Ziehau bucket = &lc->lro_hash[0]; 161805cde7efSSepherosa Ziehau } else if (M_HASHTYPE_ISHASH(m)) { 161905cde7efSSepherosa Ziehau bucket = &lc->lro_hash[m->m_pkthdr.flowid % lc->lro_hashsz]; 162005cde7efSSepherosa Ziehau } else { 162105cde7efSSepherosa Ziehau uint32_t hash; 162205cde7efSSepherosa Ziehau 162305cde7efSSepherosa Ziehau switch (eh_type) { 162405cde7efSSepherosa Ziehau #ifdef INET 162505cde7efSSepherosa Ziehau case ETHERTYPE_IP: 162605cde7efSSepherosa Ziehau hash = ip4->ip_src.s_addr + ip4->ip_dst.s_addr; 162705cde7efSSepherosa Ziehau break; 162805cde7efSSepherosa Ziehau #endif 162905cde7efSSepherosa Ziehau #ifdef INET6 163005cde7efSSepherosa Ziehau case ETHERTYPE_IPV6: 163105cde7efSSepherosa Ziehau hash = ip6->ip6_src.s6_addr32[0] + 163205cde7efSSepherosa Ziehau ip6->ip6_dst.s6_addr32[0]; 163305cde7efSSepherosa Ziehau hash += ip6->ip6_src.s6_addr32[1] + 163405cde7efSSepherosa Ziehau ip6->ip6_dst.s6_addr32[1]; 163505cde7efSSepherosa Ziehau hash += ip6->ip6_src.s6_addr32[2] + 163605cde7efSSepherosa Ziehau ip6->ip6_dst.s6_addr32[2]; 163705cde7efSSepherosa Ziehau hash += ip6->ip6_src.s6_addr32[3] + 163805cde7efSSepherosa Ziehau ip6->ip6_dst.s6_addr32[3]; 163905cde7efSSepherosa Ziehau break; 164005cde7efSSepherosa Ziehau #endif 164105cde7efSSepherosa Ziehau default: 164205cde7efSSepherosa Ziehau hash = 0; 164305cde7efSSepherosa Ziehau break; 164405cde7efSSepherosa Ziehau } 164505cde7efSSepherosa Ziehau hash += th->th_sport + th->th_dport; 164605cde7efSSepherosa Ziehau bucket = &lc->lro_hash[hash % lc->lro_hashsz]; 164705cde7efSSepherosa Ziehau } 164805cde7efSSepherosa Ziehau 164962b5b6ecSBjoern A. Zeeb /* Try to find a matching previous segment. */ 165005cde7efSSepherosa Ziehau LIST_FOREACH(le, bucket, hash_next) { 165162b5b6ecSBjoern A. Zeeb if (le->eh_type != eh_type) 165262b5b6ecSBjoern A. Zeeb continue; 165362b5b6ecSBjoern A. Zeeb if (le->source_port != th->th_sport || 165462b5b6ecSBjoern A. Zeeb le->dest_port != th->th_dport) 165562b5b6ecSBjoern A. Zeeb continue; 165662b5b6ecSBjoern A. Zeeb switch (eh_type) { 165762b5b6ecSBjoern A. Zeeb #ifdef INET6 165862b5b6ecSBjoern A. Zeeb case ETHERTYPE_IPV6: 165962b5b6ecSBjoern A. Zeeb if (bcmp(&le->source_ip6, &ip6->ip6_src, 166062b5b6ecSBjoern A. Zeeb sizeof(struct in6_addr)) != 0 || 166162b5b6ecSBjoern A. Zeeb bcmp(&le->dest_ip6, &ip6->ip6_dst, 166262b5b6ecSBjoern A. Zeeb sizeof(struct in6_addr)) != 0) 166362b5b6ecSBjoern A. Zeeb continue; 166462b5b6ecSBjoern A. Zeeb break; 166562b5b6ecSBjoern A. Zeeb #endif 166662b5b6ecSBjoern A. Zeeb #ifdef INET 166762b5b6ecSBjoern A. Zeeb case ETHERTYPE_IP: 166862b5b6ecSBjoern A. Zeeb if (le->source_ip4 != ip4->ip_src.s_addr || 166962b5b6ecSBjoern A. Zeeb le->dest_ip4 != ip4->ip_dst.s_addr) 167062b5b6ecSBjoern A. Zeeb continue; 167162b5b6ecSBjoern A. Zeeb break; 167262b5b6ecSBjoern A. Zeeb #endif 16736c5087a8SJack F Vogel } 1674e57b2d0eSRandall Stewart if (tcp_data_len || SEQ_GT(ntohl(th->th_ack), ntohl(le->ack_seq)) || 1675e57b2d0eSRandall Stewart (th->th_ack == le->ack_seq)) { 1676e57b2d0eSRandall Stewart m->m_pkthdr.lro_len = tcp_data_len; 1677d7fb35d1SSean Bruno } else { 1678d7fb35d1SSean Bruno /* no data and old ack */ 1679d7fb35d1SSean Bruno m_freem(m); 1680d7fb35d1SSean Bruno return (0); 1681d7fb35d1SSean Bruno } 1682*69a34e8dSRandall Stewart #ifdef TCPHPTS 1683*69a34e8dSRandall Stewart if ((tcplro_stacks_wanting_mbufq == 0) || (m->m_flags & M_VLANTAG)) 1684*69a34e8dSRandall Stewart goto skip_lookup_a; 1685*69a34e8dSRandall Stewart if (le->inp == NULL) { 1686*69a34e8dSRandall Stewart CURVNET_SET(lc->ifp->if_vnet); 1687*69a34e8dSRandall Stewart le->inp = tcp_lro_lookup(lc, le); 1688*69a34e8dSRandall Stewart if (le->inp) { 1689*69a34e8dSRandall Stewart in_pcbref(le->inp); 1690*69a34e8dSRandall Stewart locked = 1; 1691*69a34e8dSRandall Stewart } 1692*69a34e8dSRandall Stewart CURVNET_RESTORE(); 1693*69a34e8dSRandall Stewart } else if (le->inp) { 1694*69a34e8dSRandall Stewart INP_WLOCK(le->inp); 1695*69a34e8dSRandall Stewart locked = 1; 1696*69a34e8dSRandall Stewart } 1697*69a34e8dSRandall Stewart if (locked && ((le->inp->inp_flags & (INP_DROPPED|INP_TIMEWAIT)) || 1698*69a34e8dSRandall Stewart (le->inp->inp_flags2 & INP_FREED))) { 1699*69a34e8dSRandall Stewart /* We can't present these to the inp since 1700*69a34e8dSRandall Stewart * its dead Jim. 1701*69a34e8dSRandall Stewart */ 1702*69a34e8dSRandall Stewart int ret; 1703*69a34e8dSRandall Stewart 1704*69a34e8dSRandall Stewart ret = in_pcbrele_wlocked(le->inp); 1705*69a34e8dSRandall Stewart if (ret == 0) 1706*69a34e8dSRandall Stewart INP_WUNLOCK(le->inp); 1707*69a34e8dSRandall Stewart le->inp = NULL; 1708*69a34e8dSRandall Stewart locked = 0; 1709*69a34e8dSRandall Stewart tcp_lro_active_remove(le); 1710*69a34e8dSRandall Stewart if (le->strip_cnt && le->m_head) { 1711*69a34e8dSRandall Stewart /* 1712*69a34e8dSRandall Stewart * If we have any stripped packets we 1713*69a34e8dSRandall Stewart * just dump the whole chain. The 1714*69a34e8dSRandall Stewart * tcp_lro_flush code knows how 1715*69a34e8dSRandall Stewart * to handle things when le->m_head is NULL 1716*69a34e8dSRandall Stewart * and even le->inp is NULL. 1717*69a34e8dSRandall Stewart */ 1718*69a34e8dSRandall Stewart lro_free_mbuf_chain(le->m_head); 1719*69a34e8dSRandall Stewart le->m_head = NULL; 1720*69a34e8dSRandall Stewart } 1721*69a34e8dSRandall Stewart tcp_lro_flush(lc, le); 1722*69a34e8dSRandall Stewart return (TCP_LRO_CANNOT); 1723*69a34e8dSRandall Stewart } 1724*69a34e8dSRandall Stewart /* See if it has been switched on */ 1725*69a34e8dSRandall Stewart if (le->inp && (le->inp->inp_flags2 & INP_MBUF_ACKCMP)) 1726*69a34e8dSRandall Stewart le->flags |= CAN_USE_ACKCMP; 1727*69a34e8dSRandall Stewart 1728*69a34e8dSRandall Stewart if ((need_flush == 1) && 1729*69a34e8dSRandall Stewart le->inp && 1730*69a34e8dSRandall Stewart (le->inp->inp_flags2 & (INP_MBUF_ACKCMP|INP_SUPPORTS_MBUFQ)) && 1731*69a34e8dSRandall Stewart ((th->th_flags & ~(TH_ACK | TH_PUSH | TH_ECE | TH_CWR)) == 0)) { 1732*69a34e8dSRandall Stewart /* 1733*69a34e8dSRandall Stewart * For MBUF queuing or ACKCMP we can accept ECE and CWR 1734*69a34e8dSRandall Stewart * since each packet is sent to the transport (or the 1735*69a34e8dSRandall Stewart * compressed state including the ECN bits). 1736*69a34e8dSRandall Stewart */ 1737*69a34e8dSRandall Stewart need_flush = 0; 1738*69a34e8dSRandall Stewart } 1739*69a34e8dSRandall Stewart skip_lookup_a: 1740*69a34e8dSRandall Stewart #endif 1741e57b2d0eSRandall Stewart if (need_flush) 1742e57b2d0eSRandall Stewart le->need_wakeup = need_flush; 1743e57b2d0eSRandall Stewart /* Save of the data only csum */ 1744e57b2d0eSRandall Stewart m->m_pkthdr.rcvif = lc->ifp; 1745e57b2d0eSRandall Stewart m->m_pkthdr.lro_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th, 174662b5b6ecSBjoern A. Zeeb tcp_data_len, ~csum); 1747e57b2d0eSRandall Stewart th->th_sum = csum; /* Restore checksum */ 1748*69a34e8dSRandall Stewart #ifdef TCPHPTS 1749*69a34e8dSRandall Stewart if ((le->flags & CAN_USE_ACKCMP) || 1750*69a34e8dSRandall Stewart (le->inp && 1751*69a34e8dSRandall Stewart (le->inp->inp_flags2 & (INP_MBUF_ACKCMP|INP_SUPPORTS_MBUFQ)))) { 1752*69a34e8dSRandall Stewart /* 1753*69a34e8dSRandall Stewart * Mbuf queued and ACKCMP packets have their BPF and csum 1754*69a34e8dSRandall Stewart * done here in LRO. They will still end up looking at the 1755*69a34e8dSRandall Stewart * headers and such (IP/TCP) but we don't want to proceed 1756*69a34e8dSRandall Stewart * with any bad csum! 1757*69a34e8dSRandall Stewart */ 1758*69a34e8dSRandall Stewart m = do_bpf_and_csum(le->inp, lc, le, eh, m, bpf_peers_present(lc->ifp->if_bpf), locked); 1759*69a34e8dSRandall Stewart if (m == NULL) { 1760*69a34e8dSRandall Stewart /* Bad csum, accounting already done */ 1761*69a34e8dSRandall Stewart if (locked) { 1762*69a34e8dSRandall Stewart INP_WUNLOCK(le->inp); 1763*69a34e8dSRandall Stewart } 1764*69a34e8dSRandall Stewart return (0); 1765*69a34e8dSRandall Stewart } 1766*69a34e8dSRandall Stewart le->strip_cnt++; 1767*69a34e8dSRandall Stewart } 1768*69a34e8dSRandall Stewart if ((need_flush == 0) && 1769*69a34e8dSRandall Stewart (th->th_flags & TH_ACK) && 1770*69a34e8dSRandall Stewart (tcp_data_len == 0) && 1771*69a34e8dSRandall Stewart (le->flags & CAN_USE_ACKCMP)) { 1772*69a34e8dSRandall Stewart /* 1773*69a34e8dSRandall Stewart * Ok this is a pure ack lets find out if our 1774*69a34e8dSRandall Stewart * last packet already has one of these. 1775*69a34e8dSRandall Stewart */ 1776*69a34e8dSRandall Stewart struct mbuf *nm; 1777*69a34e8dSRandall Stewart struct tcp_ackent *ack_ent; 1778*69a34e8dSRandall Stewart int idx; 1779*69a34e8dSRandall Stewart 1780*69a34e8dSRandall Stewart INP_WLOCK_ASSERT(le->inp); 1781*69a34e8dSRandall Stewart if (le->m_head == NULL) { 1782*69a34e8dSRandall Stewart /* Ok can we still use the end of the inp's? */ 1783*69a34e8dSRandall Stewart nm = tcp_lro_get_last_if_ackcmp(lc, le, le->inp); 1784*69a34e8dSRandall Stewart if (nm == NULL) { 1785*69a34e8dSRandall Stewart /* gone or full */ 1786*69a34e8dSRandall Stewart goto new_one; 1787*69a34e8dSRandall Stewart } 1788*69a34e8dSRandall Stewart /* We can add in to the one on the tail */ 1789*69a34e8dSRandall Stewart ack_ent = mtod(nm, struct tcp_ackent *); 1790*69a34e8dSRandall Stewart idx = (nm->m_len / sizeof(struct tcp_ackent)); 1791*69a34e8dSRandall Stewart build_ack_entry(&ack_ent[idx], th, m, hdr_len, iptos); 1792*69a34e8dSRandall Stewart /* Bump the size of both pkt-hdr and len */ 1793*69a34e8dSRandall Stewart nm->m_len += sizeof(struct tcp_ackent); 1794*69a34e8dSRandall Stewart nm->m_pkthdr.len += sizeof(struct tcp_ackent); 1795*69a34e8dSRandall Stewart le->ack_seq = th->th_ack; 1796*69a34e8dSRandall Stewart le->window = th->th_win; 1797*69a34e8dSRandall Stewart m_freem(m); 1798*69a34e8dSRandall Stewart counter_u64_add(tcp_extra_mbuf, 1); 1799*69a34e8dSRandall Stewart INP_WUNLOCK(le->inp); 1800*69a34e8dSRandall Stewart return (0); 1801*69a34e8dSRandall Stewart } else if (le->m_last_mbuf->m_flags & M_ACKCMP) { 1802*69a34e8dSRandall Stewart /* Yes we might be able to be appended to */ 1803*69a34e8dSRandall Stewart nm = le->m_last_mbuf; 1804*69a34e8dSRandall Stewart if (M_TRAILINGSPACE(nm) < sizeof(struct tcp_ackent)) { 1805*69a34e8dSRandall Stewart if ((le->inp->inp_flags2 & INP_MBUF_L_ACKS) == 0) { 1806*69a34e8dSRandall Stewart counter_u64_add(tcp_would_have_but, 1); 1807*69a34e8dSRandall Stewart le->inp->inp_flags2 |= INP_MBUF_L_ACKS; 1808*69a34e8dSRandall Stewart } 1809*69a34e8dSRandall Stewart goto new_one; 1810*69a34e8dSRandall Stewart } 1811*69a34e8dSRandall Stewart /* we have room */ 1812*69a34e8dSRandall Stewart ack_ent = mtod(nm, struct tcp_ackent *); 1813*69a34e8dSRandall Stewart idx = (nm->m_len / sizeof(struct tcp_ackent)); 1814*69a34e8dSRandall Stewart build_ack_entry(&ack_ent[idx], th, m, hdr_len, iptos); 1815*69a34e8dSRandall Stewart /* Bump the size of both pkt-hdr and len */ 1816*69a34e8dSRandall Stewart nm->m_len += sizeof(struct tcp_ackent); 1817*69a34e8dSRandall Stewart nm->m_pkthdr.len += sizeof(struct tcp_ackent); 1818*69a34e8dSRandall Stewart m_freem(m); 1819*69a34e8dSRandall Stewart le->flags |= HAS_COMP_ENTRIES; 1820*69a34e8dSRandall Stewart le->cmp_ack_cnt++; 1821*69a34e8dSRandall Stewart goto compressed; 1822*69a34e8dSRandall Stewart } else { 1823*69a34e8dSRandall Stewart /* Nope we need a new one */ 1824*69a34e8dSRandall Stewart new_one: 1825*69a34e8dSRandall Stewart if (le->inp->inp_flags2 & INP_MBUF_L_ACKS) 1826*69a34e8dSRandall Stewart nm = m_getcl(M_NOWAIT, MT_DATA, (M_ACKCMP|M_PKTHDR)); 1827*69a34e8dSRandall Stewart else { 1828*69a34e8dSRandall Stewart nm = m_gethdr(M_NOWAIT, MT_DATA); 1829*69a34e8dSRandall Stewart nm->m_flags |= M_ACKCMP; 1830*69a34e8dSRandall Stewart } 1831*69a34e8dSRandall Stewart if (nm) { 1832*69a34e8dSRandall Stewart nm->m_pkthdr.rcvif = lc->ifp; 1833*69a34e8dSRandall Stewart ack_ent = mtod(nm, struct tcp_ackent *); 1834*69a34e8dSRandall Stewart build_ack_entry(ack_ent, th, m, hdr_len, iptos); 1835*69a34e8dSRandall Stewart m_freem(m); 1836*69a34e8dSRandall Stewart m = nm; 1837*69a34e8dSRandall Stewart m->m_pkthdr.len = m->m_len = sizeof(struct tcp_ackent); 1838*69a34e8dSRandall Stewart le->flags |= HAS_COMP_ENTRIES; 1839*69a34e8dSRandall Stewart le->cmp_ack_cnt++; 1840*69a34e8dSRandall Stewart } 1841*69a34e8dSRandall Stewart /* We fall through and append */ 1842*69a34e8dSRandall Stewart } 1843*69a34e8dSRandall Stewart } 1844*69a34e8dSRandall Stewart if (m->m_flags & M_ACKCMP) { 1845*69a34e8dSRandall Stewart counter_u64_add(tcp_comp_total, 1); 1846*69a34e8dSRandall Stewart } else { 1847*69a34e8dSRandall Stewart counter_u64_add(tcp_uncomp_total, 1); 1848*69a34e8dSRandall Stewart } 1849*69a34e8dSRandall Stewart #endif 1850e57b2d0eSRandall Stewart /* Save off the tail I am appending too (prev) */ 1851*69a34e8dSRandall Stewart m->m_nextpkt = NULL; 1852*69a34e8dSRandall Stewart if (le->m_head == NULL) { 1853*69a34e8dSRandall Stewart /* 1854*69a34e8dSRandall Stewart * Case where we wer chaining off the inp 1855*69a34e8dSRandall Stewart * and now no-longer can. 1856*69a34e8dSRandall Stewart */ 1857*69a34e8dSRandall Stewart le->m_head = m; 1858*69a34e8dSRandall Stewart le->m_tail = m_last(m); 1859*69a34e8dSRandall Stewart le->m_last_mbuf = m; 1860*69a34e8dSRandall Stewart le->m_prev_last = NULL; 1861*69a34e8dSRandall Stewart } else { 1862e57b2d0eSRandall Stewart le->m_prev_last = le->m_last_mbuf; 1863e57b2d0eSRandall Stewart /* Mark me in the last spot */ 1864e57b2d0eSRandall Stewart le->m_last_mbuf->m_nextpkt = m; 1865e57b2d0eSRandall Stewart /* Now set the tail to me */ 1866e57b2d0eSRandall Stewart le->m_last_mbuf = m; 1867e57b2d0eSRandall Stewart le->tcp_tot_p_len += tcp_data_len; 1868*69a34e8dSRandall Stewart } 1869*69a34e8dSRandall Stewart #ifdef TCPHPTS 1870*69a34e8dSRandall Stewart compressed: 1871*69a34e8dSRandall Stewart #endif 1872*69a34e8dSRandall Stewart le->mbuf_cnt++; 1873*69a34e8dSRandall Stewart /* Add to the total size of data */ 1874e57b2d0eSRandall Stewart lro_set_mtime(&le->mtime, &arrv); 1875*69a34e8dSRandall Stewart if (locked) 1876*69a34e8dSRandall Stewart INP_WUNLOCK(le->inp); 187762b5b6ecSBjoern A. Zeeb return (0); 18786c5087a8SJack F Vogel } 187962b5b6ecSBjoern A. Zeeb /* Try to find an empty slot. */ 18801ea44822SSepherosa Ziehau if (LIST_EMPTY(&lc->lro_free)) 1881489f0c3cSSepherosa Ziehau return (TCP_LRO_NO_ENTRIES); 188262b5b6ecSBjoern A. Zeeb 188362b5b6ecSBjoern A. Zeeb /* Start a new segment chain. */ 18841ea44822SSepherosa Ziehau le = LIST_FIRST(&lc->lro_free); 18851ea44822SSepherosa Ziehau LIST_REMOVE(le, next); 188605cde7efSSepherosa Ziehau tcp_lro_active_insert(lc, bucket, le); 1887e57b2d0eSRandall Stewart lro_set_mtime(&le->mtime, &arrv); 188862b5b6ecSBjoern A. Zeeb 188962b5b6ecSBjoern A. Zeeb /* Start filling in details. */ 189062b5b6ecSBjoern A. Zeeb switch (eh_type) { 189162b5b6ecSBjoern A. Zeeb #ifdef INET6 189262b5b6ecSBjoern A. Zeeb case ETHERTYPE_IPV6: 189362b5b6ecSBjoern A. Zeeb le->le_ip6 = ip6; 189462b5b6ecSBjoern A. Zeeb le->source_ip6 = ip6->ip6_src; 189562b5b6ecSBjoern A. Zeeb le->dest_ip6 = ip6->ip6_dst; 189662b5b6ecSBjoern A. Zeeb le->eh_type = eh_type; 189762b5b6ecSBjoern A. Zeeb le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN - sizeof(*ip6); 189862b5b6ecSBjoern A. Zeeb break; 189962b5b6ecSBjoern A. Zeeb #endif 190062b5b6ecSBjoern A. Zeeb #ifdef INET 190162b5b6ecSBjoern A. Zeeb case ETHERTYPE_IP: 190262b5b6ecSBjoern A. Zeeb le->le_ip4 = ip4; 190362b5b6ecSBjoern A. Zeeb le->source_ip4 = ip4->ip_src.s_addr; 190462b5b6ecSBjoern A. Zeeb le->dest_ip4 = ip4->ip_dst.s_addr; 190562b5b6ecSBjoern A. Zeeb le->eh_type = eh_type; 190662b5b6ecSBjoern A. Zeeb le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN; 190762b5b6ecSBjoern A. Zeeb break; 190862b5b6ecSBjoern A. Zeeb #endif 19096c5087a8SJack F Vogel } 191062b5b6ecSBjoern A. Zeeb le->source_port = th->th_sport; 191162b5b6ecSBjoern A. Zeeb le->dest_port = th->th_dport; 191262b5b6ecSBjoern A. Zeeb le->next_seq = seq + tcp_data_len; 191362b5b6ecSBjoern A. Zeeb le->ack_seq = th->th_ack; 191462b5b6ecSBjoern A. Zeeb le->window = th->th_win; 1915*69a34e8dSRandall Stewart if (hdr_len != 0) { 191662b5b6ecSBjoern A. Zeeb le->timestamp = 1; 191762b5b6ecSBjoern A. Zeeb le->tsval = ntohl(*(ts_ptr + 1)); 191862b5b6ecSBjoern A. Zeeb le->tsecr = *(ts_ptr + 2); 191962b5b6ecSBjoern A. Zeeb } 192062b5b6ecSBjoern A. Zeeb KASSERT(le->ulp_csum == 0, ("%s: le=%p le->ulp_csum=0x%04x\n", 192162b5b6ecSBjoern A. Zeeb __func__, le, le->ulp_csum)); 192262b5b6ecSBjoern A. Zeeb 1923e57b2d0eSRandall Stewart le->append_cnt = 0; 192462b5b6ecSBjoern A. Zeeb le->ulp_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th, tcp_data_len, 192562b5b6ecSBjoern A. Zeeb ~csum); 1926e57b2d0eSRandall Stewart le->append_cnt++; 1927e57b2d0eSRandall Stewart th->th_sum = csum; /* Restore checksum */ 1928e57b2d0eSRandall Stewart m->m_pkthdr.rcvif = lc->ifp; 1929*69a34e8dSRandall Stewart m->m_pkthdr.lro_len = tcp_data_len; 1930e57b2d0eSRandall Stewart le->mbuf_cnt = 1; 1931*69a34e8dSRandall Stewart le->cmp_ack_cnt = 0; 1932*69a34e8dSRandall Stewart le->flags = 0; 1933*69a34e8dSRandall Stewart #ifdef TCPHPTS 1934*69a34e8dSRandall Stewart /* 1935*69a34e8dSRandall Stewart * Lets find out if we can use the mbuf-compression. 1936*69a34e8dSRandall Stewart */ 1937*69a34e8dSRandall Stewart if ((tcplro_stacks_wanting_mbufq == 0) || (m->m_flags & M_VLANTAG)) 1938*69a34e8dSRandall Stewart goto skip_lookup_b; 1939*69a34e8dSRandall Stewart CURVNET_SET(lc->ifp->if_vnet); 1940*69a34e8dSRandall Stewart le->inp = tcp_lro_lookup(lc, le); 1941*69a34e8dSRandall Stewart if (le->inp && ((le->inp->inp_flags & (INP_DROPPED|INP_TIMEWAIT)) || 1942*69a34e8dSRandall Stewart (le->inp->inp_flags2 & INP_FREED))) { 1943*69a34e8dSRandall Stewart INP_WUNLOCK(le->inp); 1944*69a34e8dSRandall Stewart le->inp = NULL; 1945*69a34e8dSRandall Stewart } 1946*69a34e8dSRandall Stewart if (le->inp) { 1947*69a34e8dSRandall Stewart if ((need_flush == 1) && 1948*69a34e8dSRandall Stewart (le->inp->inp_flags2 & (INP_MBUF_ACKCMP|INP_SUPPORTS_MBUFQ)) && 1949*69a34e8dSRandall Stewart ((th->th_flags & ~(TH_ACK | TH_PUSH | TH_ECE | TH_CWR)) == 0)) { 1950*69a34e8dSRandall Stewart /* 1951*69a34e8dSRandall Stewart * For MBUF queuing or ACKCMP we can accept ECE and CWR 1952*69a34e8dSRandall Stewart * since each packet is sent to the transport (or the 1953*69a34e8dSRandall Stewart * compressed state including the ECN bits). 1954*69a34e8dSRandall Stewart */ 1955*69a34e8dSRandall Stewart need_flush = 0; 1956*69a34e8dSRandall Stewart } 1957*69a34e8dSRandall Stewart locked = 1; 1958*69a34e8dSRandall Stewart if (le->inp->inp_flags2 & INP_MBUF_ACKCMP) 1959*69a34e8dSRandall Stewart le->flags |= CAN_USE_ACKCMP; 1960*69a34e8dSRandall Stewart if ((le->flags & CAN_USE_ACKCMP) || 1961*69a34e8dSRandall Stewart (le->inp && 1962*69a34e8dSRandall Stewart (le->inp->inp_flags2 & (INP_MBUF_ACKCMP|INP_SUPPORTS_MBUFQ)))) { 1963*69a34e8dSRandall Stewart m = do_bpf_and_csum(le->inp, lc, le, eh, m, bpf_peers_present(lc->ifp->if_bpf), locked); 1964*69a34e8dSRandall Stewart if (m == NULL) { 1965*69a34e8dSRandall Stewart /* Bad csum, accounting already done */ 1966*69a34e8dSRandall Stewart INP_WUNLOCK(le->inp); 1967*69a34e8dSRandall Stewart le->inp = NULL; 1968*69a34e8dSRandall Stewart return (0); 1969*69a34e8dSRandall Stewart } 1970*69a34e8dSRandall Stewart le->strip_cnt++; 1971*69a34e8dSRandall Stewart } 1972*69a34e8dSRandall Stewart in_pcbref(le->inp); 1973*69a34e8dSRandall Stewart } 1974*69a34e8dSRandall Stewart CURVNET_RESTORE(); 1975*69a34e8dSRandall Stewart if ((need_flush == 0) && 1976*69a34e8dSRandall Stewart (th->th_flags & TH_ACK) && 1977*69a34e8dSRandall Stewart (tcp_data_len == 0) && 1978*69a34e8dSRandall Stewart (le->flags & CAN_USE_ACKCMP)) { 1979*69a34e8dSRandall Stewart /* Ok this is a pure ack lets build our special COMPRESS mbuf */ 1980*69a34e8dSRandall Stewart struct mbuf *nm; 1981*69a34e8dSRandall Stewart struct tcp_ackent *ack_ent; 1982*69a34e8dSRandall Stewart 1983*69a34e8dSRandall Stewart /* Question what is going on with the last mbuf on the inp queue, can we use it? */ 1984*69a34e8dSRandall Stewart INP_WLOCK_ASSERT(le->inp); 1985*69a34e8dSRandall Stewart nm = tcp_lro_get_last_if_ackcmp(lc, le, le->inp); 1986*69a34e8dSRandall Stewart if (nm) { 1987*69a34e8dSRandall Stewart int idx; 1988*69a34e8dSRandall Stewart 1989*69a34e8dSRandall Stewart /* We can add in to the one on the tail */ 1990*69a34e8dSRandall Stewart ack_ent = mtod(nm, struct tcp_ackent *); 1991*69a34e8dSRandall Stewart idx = (nm->m_len / sizeof(struct tcp_ackent)); 1992*69a34e8dSRandall Stewart build_ack_entry(&ack_ent[idx], th, m, hdr_len, iptos); 1993*69a34e8dSRandall Stewart nm->m_len += sizeof(struct tcp_ackent); 1994*69a34e8dSRandall Stewart nm->m_pkthdr.len += sizeof(struct tcp_ackent); 1995*69a34e8dSRandall Stewart le->ack_seq = th->th_ack; 1996*69a34e8dSRandall Stewart le->window = th->th_win; 1997*69a34e8dSRandall Stewart m_freem(m); 1998*69a34e8dSRandall Stewart counter_u64_add(tcp_extra_mbuf, 1); 1999*69a34e8dSRandall Stewart le->m_head = NULL; 2000*69a34e8dSRandall Stewart le->m_tail = NULL; 2001*69a34e8dSRandall Stewart le->m_last_mbuf = NULL; 2002*69a34e8dSRandall Stewart le->m_prev_last = NULL; 2003*69a34e8dSRandall Stewart INP_WUNLOCK(le->inp); 2004*69a34e8dSRandall Stewart return (0); 2005*69a34e8dSRandall Stewart } else { 2006*69a34e8dSRandall Stewart if (le->inp->inp_flags2 & INP_MBUF_L_ACKS) 2007*69a34e8dSRandall Stewart nm = m_getcl(M_NOWAIT, MT_DATA, (M_ACKCMP|M_PKTHDR)); 2008*69a34e8dSRandall Stewart else { 2009*69a34e8dSRandall Stewart nm = m_gethdr(M_NOWAIT, MT_DATA); 2010*69a34e8dSRandall Stewart nm->m_flags |= M_ACKCMP; 2011*69a34e8dSRandall Stewart } 2012*69a34e8dSRandall Stewart if (nm) { 2013*69a34e8dSRandall Stewart nm->m_pkthdr.rcvif = lc->ifp; 2014*69a34e8dSRandall Stewart ack_ent = mtod(nm, struct tcp_ackent *); 2015*69a34e8dSRandall Stewart build_ack_entry(ack_ent, th, m, hdr_len, iptos); 2016*69a34e8dSRandall Stewart m_freem(m); 2017*69a34e8dSRandall Stewart m = nm; 2018*69a34e8dSRandall Stewart m->m_pkthdr.len = m->m_len = sizeof(struct tcp_ackent); 2019*69a34e8dSRandall Stewart le->flags |= HAS_COMP_ENTRIES; 2020*69a34e8dSRandall Stewart le->cmp_ack_cnt++; 2021*69a34e8dSRandall Stewart } 2022*69a34e8dSRandall Stewart } 2023*69a34e8dSRandall Stewart } 2024*69a34e8dSRandall Stewart if (m->m_flags & M_ACKCMP) { 2025*69a34e8dSRandall Stewart counter_u64_add(tcp_comp_total, 1); 2026*69a34e8dSRandall Stewart } else { 2027*69a34e8dSRandall Stewart counter_u64_add(tcp_uncomp_total, 1); 2028*69a34e8dSRandall Stewart } 2029*69a34e8dSRandall Stewart skip_lookup_b: 2030*69a34e8dSRandall Stewart #endif 2031e57b2d0eSRandall Stewart if (need_flush) 2032e57b2d0eSRandall Stewart le->need_wakeup = need_flush; 2033e57b2d0eSRandall Stewart else 2034e57b2d0eSRandall Stewart le->need_wakeup = 0; 2035*69a34e8dSRandall Stewart m->m_nextpkt = NULL; 2036*69a34e8dSRandall Stewart le->m_head = m; 203762b5b6ecSBjoern A. Zeeb le->m_tail = m_last(m); 2038e57b2d0eSRandall Stewart le->m_last_mbuf = m; 2039e57b2d0eSRandall Stewart le->m_prev_last = NULL; 2040e57b2d0eSRandall Stewart /* 2041e57b2d0eSRandall Stewart * We keep the total size here for cross checking when we may need 2042e57b2d0eSRandall Stewart * to flush/wakeup in the MBUF_QUEUE case. 2043e57b2d0eSRandall Stewart */ 2044e57b2d0eSRandall Stewart le->tcp_tot_p_len = tcp_data_len; 2045*69a34e8dSRandall Stewart if (locked) 2046*69a34e8dSRandall Stewart INP_WUNLOCK(le->inp); 204762b5b6ecSBjoern A. Zeeb return (0); 204862b5b6ecSBjoern A. Zeeb } 204962b5b6ecSBjoern A. Zeeb 205005cde7efSSepherosa Ziehau int 205105cde7efSSepherosa Ziehau tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum) 205205cde7efSSepherosa Ziehau { 205305cde7efSSepherosa Ziehau 205405cde7efSSepherosa Ziehau return tcp_lro_rx2(lc, m, csum, 1); 205505cde7efSSepherosa Ziehau } 205605cde7efSSepherosa Ziehau 2057e936121dSHans Petter Selasky void 2058e936121dSHans Petter Selasky tcp_lro_queue_mbuf(struct lro_ctrl *lc, struct mbuf *mb) 2059e936121dSHans Petter Selasky { 2060e57b2d0eSRandall Stewart struct timespec arrv; 2061e57b2d0eSRandall Stewart 2062e936121dSHans Petter Selasky /* sanity checks */ 2063e936121dSHans Petter Selasky if (__predict_false(lc->ifp == NULL || lc->lro_mbuf_data == NULL || 2064e936121dSHans Petter Selasky lc->lro_mbuf_max == 0)) { 2065e936121dSHans Petter Selasky /* packet drop */ 2066e936121dSHans Petter Selasky m_freem(mb); 2067e936121dSHans Petter Selasky return; 2068e936121dSHans Petter Selasky } 2069e936121dSHans Petter Selasky 2070e936121dSHans Petter Selasky /* check if packet is not LRO capable */ 2071e936121dSHans Petter Selasky if (__predict_false(mb->m_pkthdr.csum_flags == 0 || 2072e936121dSHans Petter Selasky (lc->ifp->if_capenable & IFCAP_LRO) == 0)) { 2073e936121dSHans Petter Selasky /* input packet to network layer */ 2074e936121dSHans Petter Selasky (*lc->ifp->if_input) (lc->ifp, mb); 2075e936121dSHans Petter Selasky return; 2076e936121dSHans Petter Selasky } 2077e57b2d0eSRandall Stewart /* Arrival Stamp the packet */ 2078e936121dSHans Petter Selasky 2079e57b2d0eSRandall Stewart if ((mb->m_flags & M_TSTMP) == 0) { 2080e57b2d0eSRandall Stewart /* If no hardware or arrival stamp on the packet add arrival */ 2081e57b2d0eSRandall Stewart nanouptime(&arrv); 2082e57b2d0eSRandall Stewart mb->m_pkthdr.rcv_tstmp = ((arrv.tv_sec * 1000000000) + 2083e57b2d0eSRandall Stewart arrv.tv_nsec); 2084e57b2d0eSRandall Stewart mb->m_flags |= M_TSTMP_LRO; 2085e57b2d0eSRandall Stewart } 2086fc271df3SHans Petter Selasky /* create sequence number */ 2087fc271df3SHans Petter Selasky lc->lro_mbuf_data[lc->lro_mbuf_count].seq = 2088fc271df3SHans Petter Selasky (((uint64_t)M_HASHTYPE_GET(mb)) << 56) | 2089fc271df3SHans Petter Selasky (((uint64_t)mb->m_pkthdr.flowid) << 24) | 2090fc271df3SHans Petter Selasky ((uint64_t)lc->lro_mbuf_count); 2091e936121dSHans Petter Selasky 2092e936121dSHans Petter Selasky /* enter mbuf */ 2093f8acc03eSNavdeep Parhar lc->lro_mbuf_data[lc->lro_mbuf_count].mb = mb; 2094f8acc03eSNavdeep Parhar 2095f8acc03eSNavdeep Parhar /* flush if array is full */ 2096f8acc03eSNavdeep Parhar if (__predict_false(++lc->lro_mbuf_count == lc->lro_mbuf_max)) 2097f8acc03eSNavdeep Parhar tcp_lro_flush_all(lc); 2098e936121dSHans Petter Selasky } 2099e936121dSHans Petter Selasky 210062b5b6ecSBjoern A. Zeeb /* end */ 2101