1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2007, Myricom Inc. 5 * Copyright (c) 2008, Intel Corporation. 6 * Copyright (c) 2012 The FreeBSD Foundation 7 * Copyright (c) 2016-2021 Mellanox Technologies. 8 * All rights reserved. 9 * 10 * Portions of this software were developed by Bjoern Zeeb 11 * under sponsorship from the FreeBSD Foundation. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 #include "opt_inet.h" 36 #include "opt_inet6.h" 37 38 #include <sys/param.h> 39 #include <sys/systm.h> 40 #include <sys/kernel.h> 41 #include <sys/malloc.h> 42 #include <sys/mbuf.h> 43 #include <sys/socket.h> 44 #include <sys/socketvar.h> 45 #include <sys/sockbuf.h> 46 #include <sys/sysctl.h> 47 #include <sys/hash.h> 48 49 #include <net/if.h> 50 #include <net/if_var.h> 51 #include <net/ethernet.h> 52 #include <net/bpf.h> 53 #include <net/vnet.h> 54 #include <net/if_dl.h> 55 #include <net/if_media.h> 56 #include <net/if_private.h> 57 #include <net/if_types.h> 58 #include <net/infiniband.h> 59 #include <net/if_lagg.h> 60 61 #include <netinet/in_systm.h> 62 #include <netinet/in.h> 63 #include <netinet/ip6.h> 64 #include <netinet/ip.h> 65 #include <netinet/ip_var.h> 66 #include <netinet/in_pcb.h> 67 #include <netinet6/in6_pcb.h> 68 #include <netinet/tcp.h> 69 #include <netinet/tcp_seq.h> 70 #include <netinet/tcp_lro.h> 71 #include <netinet/tcp_var.h> 72 #include <netinet/tcpip.h> 73 #include <netinet/tcp_hpts.h> 74 #include <netinet/tcp_log_buf.h> 75 #include <netinet/tcp_fsm.h> 76 #include <netinet/udp.h> 77 #include <netinet6/ip6_var.h> 78 79 #include <machine/in_cksum.h> 80 81 static MALLOC_DEFINE(M_LRO, "LRO", "LRO control structures"); 82 83 static void tcp_lro_rx_done(struct lro_ctrl *lc); 84 static int tcp_lro_rx_common(struct lro_ctrl *lc, struct mbuf *m, 85 uint32_t csum, bool use_hash); 86 static void tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le); 87 88 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, lro, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 89 "TCP LRO"); 90 91 long tcplro_stacks_wanting_mbufq; 92 int (*tcp_lro_flush_tcphpts)(struct lro_ctrl *lc, struct lro_entry *le); 93 void (*tcp_hpts_softclock)(void); 94 95 counter_u64_t tcp_inp_lro_direct_queue; 96 counter_u64_t tcp_inp_lro_wokeup_queue; 97 counter_u64_t tcp_inp_lro_compressed; 98 counter_u64_t tcp_inp_lro_locks_taken; 99 counter_u64_t tcp_extra_mbuf; 100 counter_u64_t tcp_would_have_but; 101 counter_u64_t tcp_comp_total; 102 counter_u64_t tcp_uncomp_total; 103 counter_u64_t tcp_bad_csums; 104 105 static unsigned tcp_lro_entries = TCP_LRO_ENTRIES; 106 SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, entries, 107 CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_entries, 0, 108 "default number of LRO entries"); 109 110 static uint32_t tcp_lro_cpu_set_thresh = TCP_LRO_CPU_DECLARATION_THRESH; 111 SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, lro_cpu_threshold, 112 CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_cpu_set_thresh, 0, 113 "Number of interrupts in a row on the same CPU that will make us declare an 'affinity' cpu?"); 114 115 static uint32_t tcp_less_accurate_lro_ts = 0; 116 SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, lro_less_accurate, 117 CTLFLAG_MPSAFE, &tcp_less_accurate_lro_ts, 0, 118 "Do we trade off efficency by doing less timestamp operations for time accuracy?"); 119 120 SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, fullqueue, CTLFLAG_RD, 121 &tcp_inp_lro_direct_queue, "Number of lro's fully queued to transport"); 122 SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, wokeup, CTLFLAG_RD, 123 &tcp_inp_lro_wokeup_queue, "Number of lro's where we woke up transport via hpts"); 124 SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, compressed, CTLFLAG_RD, 125 &tcp_inp_lro_compressed, "Number of lro's compressed and sent to transport"); 126 SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, lockcnt, CTLFLAG_RD, 127 &tcp_inp_lro_locks_taken, "Number of lro's inp_wlocks taken"); 128 SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, extra_mbuf, CTLFLAG_RD, 129 &tcp_extra_mbuf, "Number of times we had an extra compressed ack dropped into the tp"); 130 SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, would_have_but, CTLFLAG_RD, 131 &tcp_would_have_but, "Number of times we would have had an extra compressed, but mget failed"); 132 SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, with_m_ackcmp, CTLFLAG_RD, 133 &tcp_comp_total, "Number of mbufs queued with M_ACKCMP flags set"); 134 SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, without_m_ackcmp, CTLFLAG_RD, 135 &tcp_uncomp_total, "Number of mbufs queued without M_ACKCMP"); 136 SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, lro_badcsum, CTLFLAG_RD, 137 &tcp_bad_csums, "Number of packets that the common code saw with bad csums"); 138 139 void 140 tcp_lro_reg_mbufq(void) 141 { 142 atomic_fetchadd_long(&tcplro_stacks_wanting_mbufq, 1); 143 } 144 145 void 146 tcp_lro_dereg_mbufq(void) 147 { 148 atomic_fetchadd_long(&tcplro_stacks_wanting_mbufq, -1); 149 } 150 151 static __inline void 152 tcp_lro_active_insert(struct lro_ctrl *lc, struct lro_head *bucket, 153 struct lro_entry *le) 154 { 155 156 LIST_INSERT_HEAD(&lc->lro_active, le, next); 157 LIST_INSERT_HEAD(bucket, le, hash_next); 158 } 159 160 static __inline void 161 tcp_lro_active_remove(struct lro_entry *le) 162 { 163 164 LIST_REMOVE(le, next); /* active list */ 165 LIST_REMOVE(le, hash_next); /* hash bucket */ 166 } 167 168 int 169 tcp_lro_init(struct lro_ctrl *lc) 170 { 171 return (tcp_lro_init_args(lc, NULL, tcp_lro_entries, 0)); 172 } 173 174 int 175 tcp_lro_init_args(struct lro_ctrl *lc, struct ifnet *ifp, 176 unsigned lro_entries, unsigned lro_mbufs) 177 { 178 struct lro_entry *le; 179 size_t size; 180 unsigned i; 181 182 lc->lro_bad_csum = 0; 183 lc->lro_queued = 0; 184 lc->lro_flushed = 0; 185 lc->lro_mbuf_count = 0; 186 lc->lro_mbuf_max = lro_mbufs; 187 lc->lro_cnt = lro_entries; 188 lc->lro_ackcnt_lim = TCP_LRO_ACKCNT_MAX; 189 lc->lro_length_lim = TCP_LRO_LENGTH_MAX; 190 lc->ifp = ifp; 191 LIST_INIT(&lc->lro_free); 192 LIST_INIT(&lc->lro_active); 193 194 /* Create hash table to accelerate entry lookup. */ 195 struct hashalloc_args ha = { 196 .size = lro_entries, 197 .mtype = M_LRO, 198 .mflags = M_NOWAIT, 199 .type = HASH_TYPE_PRIME, 200 }; 201 lc->lro_hash = hashalloc(&ha); 202 if (lc->lro_hash == NULL) { 203 memset(lc, 0, sizeof(*lc)); 204 return (ENOMEM); 205 } 206 lc->lro_hashsz = ha.size; 207 208 /* compute size to allocate */ 209 size = (lro_mbufs * sizeof(struct lro_mbuf_sort)) + 210 (lro_entries * sizeof(*le)); 211 lc->lro_mbuf_data = (struct lro_mbuf_sort *) 212 malloc(size, M_LRO, M_NOWAIT | M_ZERO); 213 214 /* check for out of memory */ 215 if (lc->lro_mbuf_data == NULL) { 216 struct hashalloc_args ha = { 217 .size = lc->lro_hashsz, 218 .mtype = M_LRO, 219 }; 220 hashfree(lc->lro_hash, &ha); 221 memset(lc, 0, sizeof(*lc)); 222 return (ENOMEM); 223 } 224 /* compute offset for LRO entries */ 225 le = (struct lro_entry *) 226 (lc->lro_mbuf_data + lro_mbufs); 227 228 /* setup linked list */ 229 for (i = 0; i != lro_entries; i++) 230 LIST_INSERT_HEAD(&lc->lro_free, le + i, next); 231 232 return (0); 233 } 234 235 struct vxlan_header { 236 uint32_t vxlh_flags; 237 uint32_t vxlh_vni; 238 }; 239 240 static inline void * 241 tcp_lro_low_level_parser(void *ptr, struct lro_parser *parser, bool update_data, bool is_vxlan, int mlen) 242 { 243 const struct ether_vlan_header *eh; 244 void *old; 245 uint16_t eth_type; 246 247 if (update_data) 248 memset(parser, 0, sizeof(*parser)); 249 250 old = ptr; 251 252 if (is_vxlan) { 253 const struct vxlan_header *vxh; 254 vxh = ptr; 255 ptr = (uint8_t *)ptr + sizeof(*vxh); 256 if (update_data) { 257 parser->data.vxlan_vni = 258 vxh->vxlh_vni & htonl(0xffffff00); 259 } 260 } 261 262 eh = ptr; 263 if (__predict_false(eh->evl_encap_proto == htons(ETHERTYPE_VLAN))) { 264 eth_type = eh->evl_proto; 265 if (update_data) { 266 /* strip priority and keep VLAN ID only */ 267 parser->data.vlan_id = eh->evl_tag & htons(EVL_VLID_MASK); 268 } 269 /* advance to next header */ 270 ptr = (uint8_t *)ptr + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; 271 mlen -= (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); 272 } else { 273 eth_type = eh->evl_encap_proto; 274 /* advance to next header */ 275 mlen -= ETHER_HDR_LEN; 276 ptr = (uint8_t *)ptr + ETHER_HDR_LEN; 277 } 278 if (__predict_false(mlen <= 0)) 279 return (NULL); 280 switch (eth_type) { 281 #ifdef INET 282 case htons(ETHERTYPE_IP): 283 parser->ip4 = ptr; 284 if (__predict_false(mlen < sizeof(struct ip))) 285 return (NULL); 286 /* Ensure there are no IPv4 options. */ 287 if ((parser->ip4->ip_hl << 2) != sizeof (*parser->ip4)) 288 break; 289 /* .. and the packet is not fragmented. */ 290 if (parser->ip4->ip_off & htons(IP_MF|IP_OFFMASK)) 291 break; 292 /* .. and the packet has valid src/dst addrs */ 293 if (__predict_false(parser->ip4->ip_src.s_addr == INADDR_ANY || 294 parser->ip4->ip_dst.s_addr == INADDR_ANY)) 295 break; 296 ptr = (uint8_t *)ptr + (parser->ip4->ip_hl << 2); 297 mlen -= sizeof(struct ip); 298 if (update_data) { 299 parser->data.s_addr.v4 = parser->ip4->ip_src; 300 parser->data.d_addr.v4 = parser->ip4->ip_dst; 301 } 302 switch (parser->ip4->ip_p) { 303 case IPPROTO_UDP: 304 if (__predict_false(mlen < sizeof(struct udphdr))) 305 return (NULL); 306 parser->udp = ptr; 307 if (update_data) { 308 parser->data.lro_type = LRO_TYPE_IPV4_UDP; 309 parser->data.s_port = parser->udp->uh_sport; 310 parser->data.d_port = parser->udp->uh_dport; 311 } else { 312 MPASS(parser->data.lro_type == LRO_TYPE_IPV4_UDP); 313 } 314 ptr = ((uint8_t *)ptr + sizeof(*parser->udp)); 315 parser->total_hdr_len = (uint8_t *)ptr - (uint8_t *)old; 316 return (ptr); 317 case IPPROTO_TCP: 318 parser->tcp = ptr; 319 if (__predict_false(mlen < sizeof(struct tcphdr))) 320 return (NULL); 321 if (update_data) { 322 parser->data.lro_type = LRO_TYPE_IPV4_TCP; 323 parser->data.s_port = parser->tcp->th_sport; 324 parser->data.d_port = parser->tcp->th_dport; 325 } else { 326 MPASS(parser->data.lro_type == LRO_TYPE_IPV4_TCP); 327 } 328 if (__predict_false(mlen < (parser->tcp->th_off << 2))) 329 return (NULL); 330 ptr = (uint8_t *)ptr + (parser->tcp->th_off << 2); 331 parser->total_hdr_len = (uint8_t *)ptr - (uint8_t *)old; 332 return (ptr); 333 default: 334 break; 335 } 336 break; 337 #endif 338 #ifdef INET6 339 case htons(ETHERTYPE_IPV6): 340 parser->ip6 = ptr; 341 if (__predict_false(mlen < sizeof(struct ip6_hdr))) 342 return (NULL); 343 /* Ensure the packet has valid src/dst addrs */ 344 if (__predict_false(IN6_IS_ADDR_UNSPECIFIED(&parser->ip6->ip6_src) || 345 IN6_IS_ADDR_UNSPECIFIED(&parser->ip6->ip6_dst))) 346 return (NULL); 347 ptr = (uint8_t *)ptr + sizeof(*parser->ip6); 348 if (update_data) { 349 parser->data.s_addr.v6 = parser->ip6->ip6_src; 350 parser->data.d_addr.v6 = parser->ip6->ip6_dst; 351 } 352 mlen -= sizeof(struct ip6_hdr); 353 switch (parser->ip6->ip6_nxt) { 354 case IPPROTO_UDP: 355 if (__predict_false(mlen < sizeof(struct udphdr))) 356 return (NULL); 357 parser->udp = ptr; 358 if (update_data) { 359 parser->data.lro_type = LRO_TYPE_IPV6_UDP; 360 parser->data.s_port = parser->udp->uh_sport; 361 parser->data.d_port = parser->udp->uh_dport; 362 } else { 363 MPASS(parser->data.lro_type == LRO_TYPE_IPV6_UDP); 364 } 365 ptr = (uint8_t *)ptr + sizeof(*parser->udp); 366 parser->total_hdr_len = (uint8_t *)ptr - (uint8_t *)old; 367 return (ptr); 368 case IPPROTO_TCP: 369 if (__predict_false(mlen < sizeof(struct tcphdr))) 370 return (NULL); 371 parser->tcp = ptr; 372 if (update_data) { 373 parser->data.lro_type = LRO_TYPE_IPV6_TCP; 374 parser->data.s_port = parser->tcp->th_sport; 375 parser->data.d_port = parser->tcp->th_dport; 376 } else { 377 MPASS(parser->data.lro_type == LRO_TYPE_IPV6_TCP); 378 } 379 if (__predict_false(mlen < (parser->tcp->th_off << 2))) 380 return (NULL); 381 ptr = (uint8_t *)ptr + (parser->tcp->th_off << 2); 382 parser->total_hdr_len = (uint8_t *)ptr - (uint8_t *)old; 383 return (ptr); 384 default: 385 break; 386 } 387 break; 388 #endif 389 default: 390 break; 391 } 392 /* Invalid packet - cannot parse */ 393 return (NULL); 394 } 395 396 static const int vxlan_csum = CSUM_INNER_L3_CALC | CSUM_INNER_L3_VALID | 397 CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID; 398 399 static inline struct lro_parser * 400 tcp_lro_parser(struct mbuf *m, struct lro_parser *po, struct lro_parser *pi, bool update_data) 401 { 402 void *data_ptr; 403 404 /* Try to parse outer headers first. */ 405 data_ptr = tcp_lro_low_level_parser(m->m_data, po, update_data, false, m->m_len); 406 if (data_ptr == NULL || po->total_hdr_len > m->m_len) 407 return (NULL); 408 409 if (update_data) { 410 /* Store VLAN ID, if any. */ 411 if (__predict_false(m->m_flags & M_VLANTAG)) { 412 po->data.vlan_id = 413 htons(m->m_pkthdr.ether_vtag) & htons(EVL_VLID_MASK); 414 } 415 /* Store decrypted flag, if any. */ 416 if (__predict_false((m->m_pkthdr.csum_flags & 417 CSUM_TLS_MASK) == CSUM_TLS_DECRYPTED)) 418 po->data.lro_flags |= LRO_FLAG_DECRYPTED; 419 } 420 421 switch (po->data.lro_type) { 422 case LRO_TYPE_IPV4_UDP: 423 case LRO_TYPE_IPV6_UDP: 424 /* Check for VXLAN headers. */ 425 if ((m->m_pkthdr.csum_flags & vxlan_csum) != vxlan_csum) 426 break; 427 428 /* Try to parse inner headers. */ 429 data_ptr = tcp_lro_low_level_parser(data_ptr, pi, update_data, true, 430 (m->m_len - ((caddr_t)data_ptr - m->m_data))); 431 if (data_ptr == NULL || (pi->total_hdr_len + po->total_hdr_len) > m->m_len) 432 break; 433 434 /* Verify supported header types. */ 435 switch (pi->data.lro_type) { 436 case LRO_TYPE_IPV4_TCP: 437 case LRO_TYPE_IPV6_TCP: 438 return (pi); 439 default: 440 break; 441 } 442 break; 443 case LRO_TYPE_IPV4_TCP: 444 case LRO_TYPE_IPV6_TCP: 445 if (update_data) 446 memset(pi, 0, sizeof(*pi)); 447 return (po); 448 default: 449 break; 450 } 451 return (NULL); 452 } 453 454 static inline int 455 tcp_lro_trim_mbuf_chain(struct mbuf *m, const struct lro_parser *po) 456 { 457 int len; 458 459 switch (po->data.lro_type) { 460 #ifdef INET 461 case LRO_TYPE_IPV4_TCP: 462 len = ((uint8_t *)po->ip4 - (uint8_t *)m->m_data) + 463 ntohs(po->ip4->ip_len); 464 break; 465 #endif 466 #ifdef INET6 467 case LRO_TYPE_IPV6_TCP: 468 len = ((uint8_t *)po->ip6 - (uint8_t *)m->m_data) + 469 ntohs(po->ip6->ip6_plen) + sizeof(*po->ip6); 470 break; 471 #endif 472 default: 473 return (TCP_LRO_CANNOT); 474 } 475 476 /* 477 * If the frame is padded beyond the end of the IP packet, 478 * then trim the extra bytes off: 479 */ 480 if (__predict_true(m->m_pkthdr.len == len)) { 481 return (0); 482 } else if (m->m_pkthdr.len > len) { 483 m_adj(m, len - m->m_pkthdr.len); 484 return (0); 485 } 486 return (TCP_LRO_CANNOT); 487 } 488 489 static void 490 lro_free_mbuf_chain(struct mbuf *m) 491 { 492 struct mbuf *save; 493 494 while (m) { 495 save = m->m_nextpkt; 496 m->m_nextpkt = NULL; 497 m_freem(m); 498 m = save; 499 } 500 } 501 502 void 503 tcp_lro_free(struct lro_ctrl *lc) 504 { 505 struct lro_entry *le; 506 unsigned x; 507 508 /* reset LRO free list */ 509 LIST_INIT(&lc->lro_free); 510 511 /* free active mbufs, if any */ 512 while ((le = LIST_FIRST(&lc->lro_active)) != NULL) { 513 tcp_lro_active_remove(le); 514 lro_free_mbuf_chain(le->m_head); 515 } 516 517 struct hashalloc_args ha = { 518 .size = lc->lro_hashsz, 519 .mtype = M_LRO, 520 }; 521 hashfree(lc->lro_hash, &ha); 522 lc->lro_hash = NULL; 523 lc->lro_hashsz = 0; 524 525 /* free mbuf array, if any */ 526 for (x = 0; x != lc->lro_mbuf_count; x++) 527 m_freem(lc->lro_mbuf_data[x].mb); 528 lc->lro_mbuf_count = 0; 529 530 /* free allocated memory, if any */ 531 free(lc->lro_mbuf_data, M_LRO); 532 lc->lro_mbuf_data = NULL; 533 } 534 535 static uint16_t 536 tcp_lro_rx_csum_tcphdr(const struct tcphdr *th) 537 { 538 const uint16_t *ptr; 539 uint32_t csum; 540 uint16_t len; 541 542 csum = -th->th_sum; /* exclude checksum field */ 543 len = th->th_off; 544 ptr = (const uint16_t *)th; 545 while (len--) { 546 csum += *ptr; 547 ptr++; 548 csum += *ptr; 549 ptr++; 550 } 551 while (csum > 0xffff) 552 csum = (csum >> 16) + (csum & 0xffff); 553 554 return (csum); 555 } 556 557 static uint16_t 558 tcp_lro_rx_csum_data(const struct lro_parser *pa, uint16_t tcp_csum) 559 { 560 uint32_t c; 561 uint16_t cs; 562 563 c = tcp_csum; 564 565 switch (pa->data.lro_type) { 566 #ifdef INET6 567 case LRO_TYPE_IPV6_TCP: 568 /* Compute full pseudo IPv6 header checksum. */ 569 cs = in6_cksum_pseudo(pa->ip6, ntohs(pa->ip6->ip6_plen), pa->ip6->ip6_nxt, 0); 570 break; 571 #endif 572 #ifdef INET 573 case LRO_TYPE_IPV4_TCP: 574 /* Compute full pseudo IPv4 header checsum. */ 575 cs = in_addword(ntohs(pa->ip4->ip_len) - sizeof(*pa->ip4), IPPROTO_TCP); 576 cs = in_pseudo(pa->ip4->ip_src.s_addr, pa->ip4->ip_dst.s_addr, htons(cs)); 577 break; 578 #endif 579 default: 580 cs = 0; /* Keep compiler happy. */ 581 break; 582 } 583 584 /* Complement checksum. */ 585 cs = ~cs; 586 c += cs; 587 588 /* Remove TCP header checksum. */ 589 cs = ~tcp_lro_rx_csum_tcphdr(pa->tcp); 590 c += cs; 591 592 /* Compute checksum remainder. */ 593 while (c > 0xffff) 594 c = (c >> 16) + (c & 0xffff); 595 596 return (c); 597 } 598 599 static void 600 tcp_lro_rx_done(struct lro_ctrl *lc) 601 { 602 struct lro_entry *le; 603 604 while ((le = LIST_FIRST(&lc->lro_active)) != NULL) { 605 tcp_lro_active_remove(le); 606 tcp_lro_flush(lc, le); 607 } 608 } 609 610 static void 611 tcp_lro_flush_active(struct lro_ctrl *lc) 612 { 613 struct lro_entry *le, *le_tmp; 614 615 /* 616 * Walk through the list of le entries, and 617 * any one that does have packets flush. This 618 * is called because we have an inbound packet 619 * (e.g. SYN) that has to have all others flushed 620 * in front of it. Note we have to do the remove 621 * because tcp_lro_flush() assumes that the entry 622 * is being freed. This is ok it will just get 623 * reallocated again like it was new. 624 */ 625 LIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) { 626 if (le->m_head != NULL) { 627 tcp_lro_active_remove(le); 628 tcp_lro_flush(lc, le); 629 } 630 } 631 } 632 633 void 634 tcp_lro_flush_inactive(struct lro_ctrl *lc, const struct timeval *timeout) 635 { 636 struct lro_entry *le, *le_tmp; 637 uint64_t now, tov; 638 struct bintime bt; 639 640 NET_EPOCH_ASSERT(); 641 if (LIST_EMPTY(&lc->lro_active)) 642 return; 643 644 /* get timeout time and current time in ns */ 645 binuptime(&bt); 646 now = bintime2ns(&bt); 647 tov = ((timeout->tv_sec * 1000000000) + (timeout->tv_usec * 1000)); 648 LIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) { 649 if (now >= (bintime2ns(&le->alloc_time) + tov)) { 650 tcp_lro_active_remove(le); 651 tcp_lro_flush(lc, le); 652 } 653 } 654 } 655 656 #ifdef INET 657 static int 658 tcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4) 659 { 660 uint16_t csum; 661 662 /* Legacy IP has a header checksum that needs to be correct. */ 663 if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { 664 if (__predict_false((m->m_pkthdr.csum_flags & CSUM_IP_VALID) == 0)) { 665 lc->lro_bad_csum++; 666 return (TCP_LRO_CANNOT); 667 } 668 } else { 669 csum = in_cksum_hdr(ip4); 670 if (__predict_false(csum != 0)) { 671 lc->lro_bad_csum++; 672 return (TCP_LRO_CANNOT); 673 } 674 } 675 return (0); 676 } 677 #endif 678 679 static inline void 680 tcp_lro_assign_and_checksum_16(uint16_t *ptr, uint16_t value, uint16_t *psum) 681 { 682 uint32_t csum; 683 684 csum = 0xffff - *ptr + value; 685 while (csum > 0xffff) 686 csum = (csum >> 16) + (csum & 0xffff); 687 *ptr = value; 688 *psum = csum; 689 } 690 691 static uint16_t 692 tcp_lro_update_checksum(const struct lro_parser *pa, const struct lro_entry *le, 693 uint16_t payload_len, uint16_t delta_sum) 694 { 695 uint32_t csum; 696 uint16_t tlen; 697 uint16_t temp[5] = {}; 698 699 switch (pa->data.lro_type) { 700 case LRO_TYPE_IPV4_TCP: 701 /* Compute new IPv4 length. */ 702 tlen = (pa->ip4->ip_hl << 2) + (pa->tcp->th_off << 2) + payload_len; 703 tcp_lro_assign_and_checksum_16(&pa->ip4->ip_len, htons(tlen), &temp[0]); 704 705 /* Subtract delta from current IPv4 checksum. */ 706 csum = pa->ip4->ip_sum + 0xffff - temp[0]; 707 while (csum > 0xffff) 708 csum = (csum >> 16) + (csum & 0xffff); 709 tcp_lro_assign_and_checksum_16(&pa->ip4->ip_sum, csum, &temp[1]); 710 goto update_tcp_header; 711 712 case LRO_TYPE_IPV6_TCP: 713 /* Compute new IPv6 length. */ 714 tlen = (pa->tcp->th_off << 2) + payload_len; 715 tcp_lro_assign_and_checksum_16(&pa->ip6->ip6_plen, htons(tlen), &temp[0]); 716 goto update_tcp_header; 717 718 case LRO_TYPE_IPV4_UDP: 719 /* Compute new IPv4 length. */ 720 tlen = (pa->ip4->ip_hl << 2) + sizeof(*pa->udp) + payload_len; 721 tcp_lro_assign_and_checksum_16(&pa->ip4->ip_len, htons(tlen), &temp[0]); 722 723 /* Subtract delta from current IPv4 checksum. */ 724 csum = pa->ip4->ip_sum + 0xffff - temp[0]; 725 while (csum > 0xffff) 726 csum = (csum >> 16) + (csum & 0xffff); 727 tcp_lro_assign_and_checksum_16(&pa->ip4->ip_sum, csum, &temp[1]); 728 goto update_udp_header; 729 730 case LRO_TYPE_IPV6_UDP: 731 /* Compute new IPv6 length. */ 732 tlen = sizeof(*pa->udp) + payload_len; 733 tcp_lro_assign_and_checksum_16(&pa->ip6->ip6_plen, htons(tlen), &temp[0]); 734 goto update_udp_header; 735 736 default: 737 return (0); 738 } 739 740 update_tcp_header: 741 /* Compute current TCP header checksum. */ 742 temp[2] = tcp_lro_rx_csum_tcphdr(pa->tcp); 743 744 /* Incorporate the latest ACK into the TCP header. */ 745 pa->tcp->th_ack = le->ack_seq; 746 pa->tcp->th_win = le->window; 747 748 /* Incorporate latest timestamp into the TCP header. */ 749 if (le->timestamp != 0) { 750 uint32_t *ts_ptr; 751 752 ts_ptr = (uint32_t *)(pa->tcp + 1); 753 ts_ptr[1] = htonl(le->tsval); 754 ts_ptr[2] = le->tsecr; 755 } 756 757 /* Compute new TCP header checksum. */ 758 temp[3] = tcp_lro_rx_csum_tcphdr(pa->tcp); 759 760 /* Compute new TCP checksum. */ 761 csum = pa->tcp->th_sum + 0xffff - delta_sum + 762 0xffff - temp[0] + 0xffff - temp[3] + temp[2]; 763 while (csum > 0xffff) 764 csum = (csum >> 16) + (csum & 0xffff); 765 766 /* Assign new TCP checksum. */ 767 tcp_lro_assign_and_checksum_16(&pa->tcp->th_sum, csum, &temp[4]); 768 769 /* Compute all modififications affecting next checksum. */ 770 csum = temp[0] + temp[1] + 0xffff - temp[2] + 771 temp[3] + temp[4] + delta_sum; 772 while (csum > 0xffff) 773 csum = (csum >> 16) + (csum & 0xffff); 774 775 /* Return delta checksum to next stage, if any. */ 776 return (csum); 777 778 update_udp_header: 779 tlen = sizeof(*pa->udp) + payload_len; 780 /* Assign new UDP length and compute checksum delta. */ 781 tcp_lro_assign_and_checksum_16(&pa->udp->uh_ulen, htons(tlen), &temp[2]); 782 783 /* Check if there is a UDP checksum. */ 784 if (__predict_false(pa->udp->uh_sum != 0)) { 785 /* Compute new UDP checksum. */ 786 csum = pa->udp->uh_sum + 0xffff - delta_sum + 787 0xffff - temp[0] + 0xffff - temp[2]; 788 while (csum > 0xffff) 789 csum = (csum >> 16) + (csum & 0xffff); 790 /* Assign new UDP checksum. */ 791 tcp_lro_assign_and_checksum_16(&pa->udp->uh_sum, csum, &temp[3]); 792 } 793 794 /* Compute all modififications affecting next checksum. */ 795 csum = temp[0] + temp[1] + temp[2] + temp[3] + delta_sum; 796 while (csum > 0xffff) 797 csum = (csum >> 16) + (csum & 0xffff); 798 799 /* Return delta checksum to next stage, if any. */ 800 return (csum); 801 } 802 803 static void 804 tcp_flush_out_entry(struct lro_ctrl *lc, struct lro_entry *le) 805 { 806 /* Check if we need to recompute any checksums. */ 807 if (le->needs_merge) { 808 uint16_t csum; 809 810 switch (le->inner.data.lro_type) { 811 case LRO_TYPE_IPV4_TCP: 812 csum = tcp_lro_update_checksum(&le->inner, le, 813 le->m_head->m_pkthdr.lro_tcp_d_len, 814 le->m_head->m_pkthdr.lro_tcp_d_csum); 815 csum = tcp_lro_update_checksum(&le->outer, NULL, 816 le->m_head->m_pkthdr.lro_tcp_d_len + 817 le->inner.total_hdr_len, csum); 818 le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID | 819 CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID; 820 le->m_head->m_pkthdr.csum_data = 0xffff; 821 if (__predict_false(le->outer.data.lro_flags & LRO_FLAG_DECRYPTED)) 822 le->m_head->m_pkthdr.csum_flags |= CSUM_TLS_DECRYPTED; 823 break; 824 case LRO_TYPE_IPV6_TCP: 825 csum = tcp_lro_update_checksum(&le->inner, le, 826 le->m_head->m_pkthdr.lro_tcp_d_len, 827 le->m_head->m_pkthdr.lro_tcp_d_csum); 828 csum = tcp_lro_update_checksum(&le->outer, NULL, 829 le->m_head->m_pkthdr.lro_tcp_d_len + 830 le->inner.total_hdr_len, csum); 831 le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID | 832 CSUM_PSEUDO_HDR; 833 le->m_head->m_pkthdr.csum_data = 0xffff; 834 if (__predict_false(le->outer.data.lro_flags & LRO_FLAG_DECRYPTED)) 835 le->m_head->m_pkthdr.csum_flags |= CSUM_TLS_DECRYPTED; 836 break; 837 case LRO_TYPE_NONE: 838 switch (le->outer.data.lro_type) { 839 case LRO_TYPE_IPV4_TCP: 840 csum = tcp_lro_update_checksum(&le->outer, le, 841 le->m_head->m_pkthdr.lro_tcp_d_len, 842 le->m_head->m_pkthdr.lro_tcp_d_csum); 843 le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID | 844 CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID; 845 le->m_head->m_pkthdr.csum_data = 0xffff; 846 if (__predict_false(le->outer.data.lro_flags & LRO_FLAG_DECRYPTED)) 847 le->m_head->m_pkthdr.csum_flags |= CSUM_TLS_DECRYPTED; 848 break; 849 case LRO_TYPE_IPV6_TCP: 850 csum = tcp_lro_update_checksum(&le->outer, le, 851 le->m_head->m_pkthdr.lro_tcp_d_len, 852 le->m_head->m_pkthdr.lro_tcp_d_csum); 853 le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID | 854 CSUM_PSEUDO_HDR; 855 le->m_head->m_pkthdr.csum_data = 0xffff; 856 if (__predict_false(le->outer.data.lro_flags & LRO_FLAG_DECRYPTED)) 857 le->m_head->m_pkthdr.csum_flags |= CSUM_TLS_DECRYPTED; 858 break; 859 default: 860 break; 861 } 862 break; 863 default: 864 break; 865 } 866 } 867 868 /* 869 * Break any chain, this is not set to NULL on the singleton 870 * case m_nextpkt points to m_head. Other case set them 871 * m_nextpkt to NULL in push_and_replace. 872 */ 873 le->m_head->m_nextpkt = NULL; 874 lc->lro_queued += le->m_head->m_pkthdr.lro_nsegs; 875 (*lc->ifp->if_input)(lc->ifp, le->m_head); 876 } 877 878 static void 879 tcp_set_entry_to_mbuf(struct lro_ctrl *lc, struct lro_entry *le, 880 struct mbuf *m, struct tcphdr *th) 881 { 882 uint32_t *ts_ptr; 883 uint16_t tcp_data_len; 884 uint16_t tcp_opt_len; 885 886 ts_ptr = (uint32_t *)(th + 1); 887 tcp_opt_len = (th->th_off << 2); 888 tcp_opt_len -= sizeof(*th); 889 890 /* Check if there is a timestamp option. */ 891 if (tcp_opt_len == 0 || 892 __predict_false(tcp_opt_len != TCPOLEN_TSTAMP_APPA || 893 *ts_ptr != TCP_LRO_TS_OPTION)) { 894 /* We failed to find the timestamp option. */ 895 le->timestamp = 0; 896 } else { 897 le->timestamp = 1; 898 le->tsval = ntohl(*(ts_ptr + 1)); 899 le->tsecr = *(ts_ptr + 2); 900 } 901 902 tcp_data_len = m->m_pkthdr.lro_tcp_d_len; 903 904 /* Pull out TCP sequence numbers and window size. */ 905 le->next_seq = ntohl(th->th_seq) + tcp_data_len; 906 le->ack_seq = th->th_ack; 907 le->window = th->th_win; 908 le->flags = tcp_get_flags(th); 909 le->needs_merge = 0; 910 911 /* Setup new data pointers. */ 912 le->m_head = m; 913 le->m_tail = m_last(m); 914 } 915 916 static void 917 tcp_push_and_replace(struct lro_ctrl *lc, struct lro_entry *le, struct mbuf *m) 918 { 919 struct lro_parser *pa; 920 921 /* 922 * Push up the stack of the current entry 923 * and replace it with "m". 924 */ 925 struct mbuf *msave; 926 927 /* Grab off the next and save it */ 928 msave = le->m_head->m_nextpkt; 929 le->m_head->m_nextpkt = NULL; 930 931 /* Now push out the old entry */ 932 tcp_flush_out_entry(lc, le); 933 934 /* Re-parse new header, should not fail. */ 935 pa = tcp_lro_parser(m, &le->outer, &le->inner, false); 936 KASSERT(pa != NULL, 937 ("tcp_push_and_replace: LRO parser failed on m=%p\n", m)); 938 939 /* 940 * Now to replace the data properly in the entry 941 * we have to reset the TCP header and 942 * other fields. 943 */ 944 tcp_set_entry_to_mbuf(lc, le, m, pa->tcp); 945 946 /* Restore the next list */ 947 m->m_nextpkt = msave; 948 } 949 950 static void 951 tcp_lro_mbuf_append_pkthdr(struct lro_entry *le, const struct mbuf *p) 952 { 953 struct mbuf *m; 954 uint32_t csum; 955 956 m = le->m_head; 957 if (m->m_pkthdr.lro_nsegs == 1) { 958 /* Compute relative checksum. */ 959 csum = p->m_pkthdr.lro_tcp_d_csum; 960 } else { 961 /* Merge TCP data checksums. */ 962 csum = (uint32_t)m->m_pkthdr.lro_tcp_d_csum + 963 (uint32_t)p->m_pkthdr.lro_tcp_d_csum; 964 while (csum > 0xffff) 965 csum = (csum >> 16) + (csum & 0xffff); 966 } 967 968 /* Update various counters. */ 969 m->m_pkthdr.len += p->m_pkthdr.lro_tcp_d_len; 970 m->m_pkthdr.lro_tcp_d_csum = csum; 971 m->m_pkthdr.lro_tcp_d_len += p->m_pkthdr.lro_tcp_d_len; 972 m->m_pkthdr.lro_nsegs += p->m_pkthdr.lro_nsegs; 973 le->needs_merge = 1; 974 } 975 976 static void 977 tcp_lro_condense(struct lro_ctrl *lc, struct lro_entry *le) 978 { 979 /* 980 * Walk through the mbuf chain we 981 * have on tap and compress/condense 982 * as required. 983 */ 984 uint32_t *ts_ptr; 985 struct mbuf *m; 986 struct tcphdr *th; 987 uint32_t tcp_data_len_total; 988 uint32_t tcp_data_seg_total; 989 uint16_t tcp_data_len; 990 uint16_t tcp_opt_len; 991 992 /* 993 * First we must check the lead (m_head) 994 * we must make sure that it is *not* 995 * something that should be sent up 996 * right away (sack etc). 997 */ 998 again: 999 m = le->m_head->m_nextpkt; 1000 if (m == NULL) { 1001 /* Just one left. */ 1002 return; 1003 } 1004 1005 th = tcp_lro_get_th(m); 1006 tcp_opt_len = (th->th_off << 2); 1007 tcp_opt_len -= sizeof(*th); 1008 ts_ptr = (uint32_t *)(th + 1); 1009 1010 if (tcp_opt_len != 0 && __predict_false(tcp_opt_len != TCPOLEN_TSTAMP_APPA || 1011 *ts_ptr != TCP_LRO_TS_OPTION)) { 1012 /* 1013 * Its not the timestamp. We can't 1014 * use this guy as the head. 1015 */ 1016 le->m_head->m_nextpkt = m->m_nextpkt; 1017 tcp_push_and_replace(lc, le, m); 1018 goto again; 1019 } 1020 if ((tcp_get_flags(th) & ~(TH_ACK | TH_PUSH)) != 0) { 1021 /* 1022 * Make sure that previously seen segments/ACKs are delivered 1023 * before this segment, e.g. FIN. 1024 */ 1025 le->m_head->m_nextpkt = m->m_nextpkt; 1026 tcp_push_and_replace(lc, le, m); 1027 goto again; 1028 } 1029 while((m = le->m_head->m_nextpkt) != NULL) { 1030 /* 1031 * condense m into le, first 1032 * pull m out of the list. 1033 */ 1034 le->m_head->m_nextpkt = m->m_nextpkt; 1035 m->m_nextpkt = NULL; 1036 /* Setup my data */ 1037 tcp_data_len = m->m_pkthdr.lro_tcp_d_len; 1038 th = tcp_lro_get_th(m); 1039 ts_ptr = (uint32_t *)(th + 1); 1040 tcp_opt_len = (th->th_off << 2); 1041 tcp_opt_len -= sizeof(*th); 1042 tcp_data_len_total = le->m_head->m_pkthdr.lro_tcp_d_len + tcp_data_len; 1043 tcp_data_seg_total = le->m_head->m_pkthdr.lro_nsegs + m->m_pkthdr.lro_nsegs; 1044 1045 if (tcp_data_seg_total >= lc->lro_ackcnt_lim || 1046 tcp_data_len_total >= lc->lro_length_lim) { 1047 /* Flush now if appending will result in overflow. */ 1048 tcp_push_and_replace(lc, le, m); 1049 goto again; 1050 } 1051 if (tcp_opt_len != 0 && 1052 __predict_false(tcp_opt_len != TCPOLEN_TSTAMP_APPA || 1053 *ts_ptr != TCP_LRO_TS_OPTION)) { 1054 /* 1055 * Maybe a sack in the new one? We need to 1056 * start all over after flushing the 1057 * current le. We will go up to the beginning 1058 * and flush it (calling the replace again possibly 1059 * or just returning). 1060 */ 1061 tcp_push_and_replace(lc, le, m); 1062 goto again; 1063 } 1064 if ((tcp_get_flags(th) & ~(TH_ACK | TH_PUSH)) != 0) { 1065 tcp_push_and_replace(lc, le, m); 1066 goto again; 1067 } 1068 if (tcp_opt_len != 0) { 1069 uint32_t tsval = ntohl(*(ts_ptr + 1)); 1070 /* Make sure timestamp values are increasing. */ 1071 if (TSTMP_GT(le->tsval, tsval)) { 1072 tcp_push_and_replace(lc, le, m); 1073 goto again; 1074 } 1075 le->tsval = tsval; 1076 le->tsecr = *(ts_ptr + 2); 1077 } 1078 /* Try to append the new segment. */ 1079 if (__predict_false(ntohl(th->th_seq) != le->next_seq || 1080 ((tcp_get_flags(th) & TH_ACK) != 1081 (le->flags & TH_ACK)) || 1082 (tcp_data_len == 0 && 1083 le->ack_seq == th->th_ack && 1084 le->window == th->th_win))) { 1085 /* Out of order packet, non-ACK + ACK or dup ACK. */ 1086 tcp_push_and_replace(lc, le, m); 1087 goto again; 1088 } 1089 if (tcp_data_len != 0 || 1090 SEQ_GT(ntohl(th->th_ack), ntohl(le->ack_seq))) { 1091 le->next_seq += tcp_data_len; 1092 le->ack_seq = th->th_ack; 1093 le->window = th->th_win; 1094 le->needs_merge = 1; 1095 } else if (th->th_ack == le->ack_seq) { 1096 if (WIN_GT(th->th_win, le->window)) { 1097 le->window = th->th_win; 1098 le->needs_merge = 1; 1099 } 1100 } 1101 1102 if (tcp_data_len == 0) { 1103 m_freem(m); 1104 continue; 1105 } 1106 1107 /* Merge TCP data checksum and length to head mbuf. */ 1108 tcp_lro_mbuf_append_pkthdr(le, m); 1109 1110 /* 1111 * Adjust the mbuf so that m_data points to the first byte of 1112 * the ULP payload. Adjust the mbuf to avoid complications and 1113 * append new segment to existing mbuf chain. 1114 */ 1115 m_adj(m, m->m_pkthdr.len - tcp_data_len); 1116 m_demote_pkthdr(m); 1117 le->m_tail->m_next = m; 1118 le->m_tail = m_last(m); 1119 } 1120 } 1121 1122 static void 1123 tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le) 1124 { 1125 1126 /* Only optimise if there are multiple packets waiting. */ 1127 NET_EPOCH_ASSERT(); 1128 if (tcp_lro_flush_tcphpts == NULL || 1129 tcp_lro_flush_tcphpts(lc, le) != 0) { 1130 tcp_lro_condense(lc, le); 1131 tcp_flush_out_entry(lc, le); 1132 } 1133 lc->lro_flushed++; 1134 bzero(le, sizeof(*le)); 1135 LIST_INSERT_HEAD(&lc->lro_free, le, next); 1136 } 1137 1138 #define tcp_lro_msb_64(x) (1ULL << (flsll(x) - 1)) 1139 1140 /* 1141 * The tcp_lro_sort() routine is comparable to qsort(), except it has 1142 * a worst case complexity limit of O(MIN(N,64)*N), where N is the 1143 * number of elements to sort and 64 is the number of sequence bits 1144 * available. The algorithm is bit-slicing the 64-bit sequence number, 1145 * sorting one bit at a time from the most significant bit until the 1146 * least significant one, skipping the constant bits. This is 1147 * typically called a radix sort. 1148 */ 1149 static void 1150 tcp_lro_sort(struct lro_mbuf_sort *parray, uint32_t size) 1151 { 1152 struct lro_mbuf_sort temp; 1153 uint64_t ones; 1154 uint64_t zeros; 1155 uint32_t x; 1156 uint32_t y; 1157 1158 repeat: 1159 /* for small arrays insertion sort is faster */ 1160 if (size <= 12) { 1161 for (x = 1; x < size; x++) { 1162 temp = parray[x]; 1163 for (y = x; y > 0 && temp.seq < parray[y - 1].seq; y--) 1164 parray[y] = parray[y - 1]; 1165 parray[y] = temp; 1166 } 1167 return; 1168 } 1169 1170 /* compute sequence bits which are constant */ 1171 ones = 0; 1172 zeros = 0; 1173 for (x = 0; x != size; x++) { 1174 ones |= parray[x].seq; 1175 zeros |= ~parray[x].seq; 1176 } 1177 1178 /* compute bits which are not constant into "ones" */ 1179 ones &= zeros; 1180 if (ones == 0) 1181 return; 1182 1183 /* pick the most significant bit which is not constant */ 1184 ones = tcp_lro_msb_64(ones); 1185 1186 /* 1187 * Move entries having cleared sequence bits to the beginning 1188 * of the array: 1189 */ 1190 for (x = y = 0; y != size; y++) { 1191 /* skip set bits */ 1192 if (parray[y].seq & ones) 1193 continue; 1194 /* swap entries */ 1195 temp = parray[x]; 1196 parray[x] = parray[y]; 1197 parray[y] = temp; 1198 x++; 1199 } 1200 1201 KASSERT(x != 0 && x != size, ("Memory is corrupted\n")); 1202 1203 /* sort zeros */ 1204 tcp_lro_sort(parray, x); 1205 1206 /* sort ones */ 1207 parray += x; 1208 size -= x; 1209 goto repeat; 1210 } 1211 1212 void 1213 tcp_lro_flush_all(struct lro_ctrl *lc) 1214 { 1215 uint64_t seq; 1216 uint64_t nseq; 1217 unsigned x; 1218 1219 NET_EPOCH_ASSERT(); 1220 /* check if no mbufs to flush */ 1221 if (lc->lro_mbuf_count == 0) 1222 goto done; 1223 if (lc->lro_cpu_is_set == 0) { 1224 if (lc->lro_last_cpu == curcpu) { 1225 lc->lro_cnt_of_same_cpu++; 1226 /* Have we reached the threshold to declare a cpu? */ 1227 if (lc->lro_cnt_of_same_cpu > tcp_lro_cpu_set_thresh) 1228 lc->lro_cpu_is_set = 1; 1229 } else { 1230 lc->lro_last_cpu = curcpu; 1231 lc->lro_cnt_of_same_cpu = 0; 1232 } 1233 } 1234 CURVNET_SET(lc->ifp->if_vnet); 1235 1236 /* get current time */ 1237 binuptime(&lc->lro_last_queue_time); 1238 1239 /* sort all mbufs according to stream */ 1240 tcp_lro_sort(lc->lro_mbuf_data, lc->lro_mbuf_count); 1241 1242 /* input data into LRO engine, stream by stream */ 1243 seq = 0; 1244 for (x = 0; x != lc->lro_mbuf_count; x++) { 1245 struct mbuf *mb; 1246 1247 /* get mbuf */ 1248 mb = lc->lro_mbuf_data[x].mb; 1249 1250 /* get sequence number, masking away the packet index */ 1251 nseq = lc->lro_mbuf_data[x].seq & (-1ULL << 24); 1252 1253 /* check for new stream */ 1254 if (seq != nseq) { 1255 seq = nseq; 1256 1257 /* flush active streams */ 1258 tcp_lro_rx_done(lc); 1259 } 1260 1261 /* add packet to LRO engine */ 1262 if (tcp_lro_rx_common(lc, mb, 0, false) != 0) { 1263 /* Flush anything we have acummulated */ 1264 tcp_lro_flush_active(lc); 1265 /* input packet to network layer */ 1266 (*lc->ifp->if_input)(lc->ifp, mb); 1267 lc->lro_queued++; 1268 lc->lro_flushed++; 1269 } 1270 } 1271 CURVNET_RESTORE(); 1272 done: 1273 /* flush active streams */ 1274 tcp_lro_rx_done(lc); 1275 if (tcp_hpts_softclock != NULL) 1276 tcp_hpts_softclock(); 1277 lc->lro_mbuf_count = 0; 1278 } 1279 1280 static struct lro_head * 1281 tcp_lro_rx_get_bucket(struct lro_ctrl *lc, struct mbuf *m, struct lro_parser *parser) 1282 { 1283 u_long hash; 1284 1285 if (M_HASHTYPE_ISHASH(m)) { 1286 hash = m->m_pkthdr.flowid; 1287 } else { 1288 for (unsigned i = hash = 0; i != LRO_RAW_ADDRESS_MAX; i++) 1289 hash += parser->data.raw[i]; 1290 } 1291 return (&lc->lro_hash[hash % lc->lro_hashsz]); 1292 } 1293 1294 static int 1295 tcp_lro_rx_common(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, bool use_hash) 1296 { 1297 struct lro_parser pi; /* inner address data */ 1298 struct lro_parser po; /* outer address data */ 1299 struct lro_parser *pa; /* current parser for TCP stream */ 1300 struct lro_entry *le; 1301 struct lro_head *bucket; 1302 struct tcphdr *th; 1303 int tcp_data_len; 1304 int tcp_opt_len; 1305 int error; 1306 uint16_t tcp_data_sum; 1307 1308 /* We expect a contiguous header [eh, ip, tcp]. */ 1309 pa = tcp_lro_parser(m, &po, &pi, true); 1310 if (__predict_false(pa == NULL)) 1311 return (TCP_LRO_NOT_SUPPORTED); 1312 1313 /* We don't expect any padding. */ 1314 error = tcp_lro_trim_mbuf_chain(m, pa); 1315 if (__predict_false(error != 0)) 1316 return (error); 1317 1318 #ifdef INET 1319 switch (pa->data.lro_type) { 1320 case LRO_TYPE_IPV4_TCP: 1321 error = tcp_lro_rx_ipv4(lc, m, pa->ip4); 1322 if (__predict_false(error != 0)) 1323 return (error); 1324 break; 1325 default: 1326 break; 1327 } 1328 #endif 1329 /* If no hardware or arrival stamp on the packet add timestamp */ 1330 if ((m->m_flags & (M_TSTMP_LRO | M_TSTMP)) == 0) { 1331 m->m_pkthdr.rcv_tstmp = bintime2ns(&lc->lro_last_queue_time); 1332 m->m_flags |= M_TSTMP_LRO; 1333 } 1334 1335 /* Get pointer to TCP header. */ 1336 th = pa->tcp; 1337 1338 /* Don't process SYN packets. */ 1339 if (__predict_false(tcp_get_flags(th) & TH_SYN)) 1340 return (TCP_LRO_CANNOT); 1341 1342 /* Get total TCP header length and compute payload length. */ 1343 tcp_opt_len = (th->th_off << 2); 1344 tcp_data_len = m->m_pkthdr.len - ((uint8_t *)th - 1345 (uint8_t *)m->m_data) - tcp_opt_len; 1346 tcp_opt_len -= sizeof(*th); 1347 1348 /* Don't process invalid TCP headers. */ 1349 if (__predict_false(tcp_opt_len < 0 || tcp_data_len < 0)) 1350 return (TCP_LRO_CANNOT); 1351 1352 /* Compute TCP data only checksum. */ 1353 if (tcp_data_len == 0) 1354 tcp_data_sum = 0; /* no data, no checksum */ 1355 else if (__predict_false(csum != 0)) 1356 tcp_data_sum = tcp_lro_rx_csum_data(pa, ~csum); 1357 else 1358 tcp_data_sum = tcp_lro_rx_csum_data(pa, ~th->th_sum); 1359 1360 /* Save TCP info in mbuf. */ 1361 m->m_nextpkt = NULL; 1362 m->m_pkthdr.rcvif = lc->ifp; 1363 m->m_pkthdr.lro_tcp_d_csum = tcp_data_sum; 1364 m->m_pkthdr.lro_tcp_d_len = tcp_data_len; 1365 m->m_pkthdr.lro_tcp_h_off = ((uint8_t *)th - (uint8_t *)m->m_data); 1366 m->m_pkthdr.lro_nsegs = 1; 1367 1368 /* Get hash bucket. */ 1369 if (!use_hash) { 1370 bucket = &lc->lro_hash[0]; 1371 } else { 1372 bucket = tcp_lro_rx_get_bucket(lc, m, pa); 1373 } 1374 1375 /* Try to find a matching previous segment. */ 1376 LIST_FOREACH(le, bucket, hash_next) { 1377 /* Compare addresses and ports. */ 1378 if (lro_address_compare(&po.data, &le->outer.data) == false || 1379 lro_address_compare(&pi.data, &le->inner.data) == false) 1380 continue; 1381 1382 /* Check if no data and old ACK. */ 1383 if (tcp_data_len == 0 && 1384 SEQ_LT(ntohl(th->th_ack), ntohl(le->ack_seq))) { 1385 m_freem(m); 1386 return (0); 1387 } 1388 1389 /* Mark "m" in the last spot. */ 1390 le->m_last_mbuf->m_nextpkt = m; 1391 /* Now set the tail to "m". */ 1392 le->m_last_mbuf = m; 1393 return (0); 1394 } 1395 1396 /* Try to find an empty slot. */ 1397 if (LIST_EMPTY(&lc->lro_free)) 1398 return (TCP_LRO_NO_ENTRIES); 1399 1400 /* Start a new segment chain. */ 1401 le = LIST_FIRST(&lc->lro_free); 1402 LIST_REMOVE(le, next); 1403 tcp_lro_active_insert(lc, bucket, le); 1404 1405 /* Make sure the headers are set. */ 1406 le->inner = pi; 1407 le->outer = po; 1408 1409 /* Store time this entry was allocated. */ 1410 le->alloc_time = lc->lro_last_queue_time; 1411 1412 tcp_set_entry_to_mbuf(lc, le, m, th); 1413 1414 /* Now set the tail to "m". */ 1415 le->m_last_mbuf = m; 1416 1417 return (0); 1418 } 1419 1420 int 1421 tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum) 1422 { 1423 int error; 1424 1425 CURVNET_SET(lc->ifp->if_vnet); 1426 #ifdef INET 1427 /* Quickly decide if packet cannot be LRO'ed */ 1428 if (__predict_false(V_ipforwarding != 0)) { 1429 CURVNET_RESTORE(); 1430 return (TCP_LRO_CANNOT); 1431 } 1432 #endif 1433 #ifdef INET6 1434 /* Quickly decide if packet cannot be LRO'ed */ 1435 if (__predict_false(V_ip6_forwarding != 0)) { 1436 CURVNET_RESTORE(); 1437 return (TCP_LRO_CANNOT); 1438 } 1439 #endif 1440 1441 if (((m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) != 1442 ((CSUM_DATA_VALID | CSUM_PSEUDO_HDR))) || 1443 (m->m_pkthdr.csum_data != 0xffff)) { 1444 /* 1445 * The checksum either did not have hardware offload 1446 * or it was a bad checksum. We can't LRO such 1447 * a packet. 1448 */ 1449 counter_u64_add(tcp_bad_csums, 1); 1450 CURVNET_RESTORE(); 1451 return (TCP_LRO_CANNOT); 1452 } 1453 1454 /* get current time */ 1455 binuptime(&lc->lro_last_queue_time); 1456 error = tcp_lro_rx_common(lc, m, csum, true); 1457 if (__predict_false(error != 0)) { 1458 /* 1459 * Flush anything we have acummulated 1460 * ahead of this packet that can't 1461 * be LRO'd. This preserves order. 1462 */ 1463 tcp_lro_flush_active(lc); 1464 } 1465 CURVNET_RESTORE(); 1466 1467 return (error); 1468 } 1469 1470 void 1471 tcp_lro_queue_mbuf(struct lro_ctrl *lc, struct mbuf *mb) 1472 { 1473 NET_EPOCH_ASSERT(); 1474 /* sanity checks */ 1475 if (__predict_false(lc->ifp == NULL || lc->lro_mbuf_data == NULL || 1476 lc->lro_mbuf_max == 0)) { 1477 /* packet drop */ 1478 m_freem(mb); 1479 return; 1480 } 1481 1482 CURVNET_SET(lc->ifp->if_vnet); 1483 #ifdef INET 1484 /* Quickly decide if packet cannot be LRO'ed */ 1485 if (__predict_false(V_ipforwarding != 0)) { 1486 /* input packet to network layer */ 1487 CURVNET_RESTORE(); 1488 (*lc->ifp->if_input) (lc->ifp, mb); 1489 return; 1490 } 1491 #endif 1492 #ifdef INET6 1493 /* Quickly decide if packet cannot be LRO'ed */ 1494 if (__predict_false(V_ip6_forwarding != 0)) { 1495 /* input packet to network layer */ 1496 CURVNET_RESTORE(); 1497 (*lc->ifp->if_input) (lc->ifp, mb); 1498 return; 1499 } 1500 #endif 1501 CURVNET_RESTORE(); 1502 /* check if packet is not LRO capable */ 1503 if (__predict_false((lc->ifp->if_capenable & IFCAP_LRO) == 0)) { 1504 /* input packet to network layer */ 1505 (*lc->ifp->if_input) (lc->ifp, mb); 1506 return; 1507 } 1508 1509 if (((mb->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) != 1510 ((CSUM_DATA_VALID | CSUM_PSEUDO_HDR))) || 1511 (mb->m_pkthdr.csum_data != 0xffff)) { 1512 /* 1513 * The checksum either did not have hardware offload 1514 * or it was a bad checksum. We can't LRO such 1515 * a packet. 1516 */ 1517 counter_u64_add(tcp_bad_csums, 1); 1518 (*lc->ifp->if_input) (lc->ifp, mb); 1519 return; 1520 } 1521 1522 /* If no hardware or arrival stamp on the packet add timestamp */ 1523 if ((tcplro_stacks_wanting_mbufq > 0) && 1524 (tcp_less_accurate_lro_ts == 0) && 1525 ((mb->m_flags & M_TSTMP) == 0)) { 1526 /* Add in an LRO time since no hardware */ 1527 binuptime(&lc->lro_last_queue_time); 1528 mb->m_pkthdr.rcv_tstmp = bintime2ns(&lc->lro_last_queue_time); 1529 mb->m_flags |= M_TSTMP_LRO; 1530 } 1531 1532 /* create sequence number */ 1533 lc->lro_mbuf_data[lc->lro_mbuf_count].seq = lc->lro_mbuf_count; 1534 if (M_HASHTYPE_ISHASH(mb)) 1535 lc->lro_mbuf_data[lc->lro_mbuf_count].seq |= 1536 (((uint64_t)M_HASHTYPE_GET(mb)) << 56) | 1537 (((uint64_t)mb->m_pkthdr.flowid) << 24); 1538 1539 /* enter mbuf */ 1540 lc->lro_mbuf_data[lc->lro_mbuf_count].mb = mb; 1541 1542 /* flush if array is full */ 1543 if (__predict_false(++lc->lro_mbuf_count == lc->lro_mbuf_max)) 1544 tcp_lro_flush_all(lc); 1545 } 1546 1547 /* end */ 1548