/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2007, Myricom Inc.
 * Copyright (c) 2008, Intel Corporation.
 * Copyright (c) 2012 The FreeBSD Foundation
 * Copyright (c) 2016-2021 Mellanox Technologies.
 * All rights reserved.
 *
 * Portions of this software were developed by Bjoern Zeeb
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include "opt_inet.h"
#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockbuf.h>
#include <sys/sysctl.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/ethernet.h>
#include <net/bpf.h>
#include <net/vnet.h>
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/if_private.h>
#include <net/if_types.h>
#include <net/infiniband.h>
#include <net/if_lagg.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/in_pcb.h>
#include <netinet6/in6_pcb.h>
#include <netinet/tcp.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_lro.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_fsm.h>
#include <netinet/udp.h>
#include <netinet6/ip6_var.h>

#include <machine/in_cksum.h>

static MALLOC_DEFINE(M_LRO, "LRO", "LRO control structures");

static void	tcp_lro_rx_done(struct lro_ctrl *lc);
static int	tcp_lro_rx_common(struct lro_ctrl *lc, struct mbuf *m,
		    uint32_t csum, bool use_hash);

SYSCTL_NODE(_net_inet_tcp, OID_AUTO, lro, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "TCP LRO");

long tcplro_stacks_wanting_mbufq;
int	(*tcp_lro_flush_tcphpts)(struct lro_ctrl *lc, struct lro_entry *le);

counter_u64_t tcp_inp_lro_direct_queue;
counter_u64_t tcp_inp_lro_wokeup_queue;
counter_u64_t tcp_inp_lro_compressed;
counter_u64_t tcp_inp_lro_locks_taken;
counter_u64_t tcp_extra_mbuf;
counter_u64_t tcp_would_have_but;
counter_u64_t tcp_comp_total;
counter_u64_t tcp_uncomp_total;
counter_u64_t tcp_bad_csums;

static unsigned	tcp_lro_entries = TCP_LRO_ENTRIES;
SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, entries,
    CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_entries, 0,
    "default number of LRO entries");

static uint32_t tcp_lro_cpu_set_thresh = TCP_LRO_CPU_DECLARATION_THRESH;
SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, lro_cpu_threshold,
    CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_cpu_set_thresh, 0,
    "Number of interrupts in a row on the same CPU that will make us declare an 'affinity' CPU");

static uint32_t tcp_less_accurate_lro_ts = 0;
SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, lro_less_accurate,
    CTLFLAG_MPSAFE, &tcp_less_accurate_lro_ts, 0,
    "Do we trade off efficiency by doing fewer timestamp operations for time accuracy?");

SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, fullqueue, CTLFLAG_RD,
    &tcp_inp_lro_direct_queue, "Number of lro's fully queued to transport");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, wokeup, CTLFLAG_RD,
    &tcp_inp_lro_wokeup_queue, "Number of lro's where we woke up transport via hpts");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, compressed, CTLFLAG_RD,
    &tcp_inp_lro_compressed, "Number of lro's compressed and sent to transport");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, lockcnt, CTLFLAG_RD,
    &tcp_inp_lro_locks_taken, "Number of lro's inp_wlocks taken");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, extra_mbuf, CTLFLAG_RD,
    &tcp_extra_mbuf, "Number of times we had an extra compressed ack dropped into the tp");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, would_have_but, CTLFLAG_RD,
    &tcp_would_have_but, "Number of times we would have had an extra compressed, but mget failed");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, with_m_ackcmp, CTLFLAG_RD,
    &tcp_comp_total, "Number of mbufs queued with M_ACKCMP flags set");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, without_m_ackcmp, CTLFLAG_RD,
    &tcp_uncomp_total, "Number of mbufs queued without M_ACKCMP");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, lro_badcsum, CTLFLAG_RD,
    &tcp_bad_csums, "Number of packets that the common code saw with bad csums");

void
tcp_lro_reg_mbufq(void)
{
	atomic_fetchadd_long(&tcplro_stacks_wanting_mbufq, 1);
}

void
tcp_lro_dereg_mbufq(void)
{
	atomic_fetchadd_long(&tcplro_stacks_wanting_mbufq, -1);
}

static __inline void
tcp_lro_active_insert(struct lro_ctrl *lc, struct lro_head *bucket,
    struct lro_entry *le)
{

	LIST_INSERT_HEAD(&lc->lro_active, le, next);
	LIST_INSERT_HEAD(bucket, le, hash_next);
}

static __inline void
tcp_lro_active_remove(struct lro_entry *le)
{

	LIST_REMOVE(le, next);		/* active list */
	LIST_REMOVE(le, hash_next);	/* hash bucket */
}

int
tcp_lro_init(struct lro_ctrl *lc)
{
	return (tcp_lro_init_args(lc, NULL, tcp_lro_entries, 0));
}

int
tcp_lro_init_args(struct lro_ctrl *lc, struct ifnet *ifp,
    unsigned lro_entries, unsigned lro_mbufs)
{
	struct lro_entry *le;
	size_t size;
	unsigned i;

	lc->lro_bad_csum = 0;
	lc->lro_queued = 0;
	lc->lro_flushed = 0;
	lc->lro_mbuf_count = 0;
	lc->lro_mbuf_max = lro_mbufs;
	lc->lro_cnt = lro_entries;
	lc->lro_ackcnt_lim = TCP_LRO_ACKCNT_MAX;
	lc->lro_length_lim = TCP_LRO_LENGTH_MAX;
	lc->ifp = ifp;
	LIST_INIT(&lc->lro_free);
	LIST_INIT(&lc->lro_active);

	/* create hash table to accelerate entry lookup */
	lc->lro_hash = phashinit_flags(lro_entries, M_LRO, &lc->lro_hashsz,
	    HASH_NOWAIT);
	if (lc->lro_hash == NULL) {
		memset(lc, 0, sizeof(*lc));
		return (ENOMEM);
	}

	/* compute size to allocate */
	size = (lro_mbufs * sizeof(struct lro_mbuf_sort)) +
	    (lro_entries * sizeof(*le));
	lc->lro_mbuf_data = (struct lro_mbuf_sort *)
	    malloc(size, M_LRO, M_NOWAIT | M_ZERO);

	/* check for out of memory */
	if (lc->lro_mbuf_data == NULL) {
		free(lc->lro_hash, M_LRO);
		memset(lc, 0, sizeof(*lc));
		return (ENOMEM);
	}
	/* compute offset for LRO entries */
	le = (struct lro_entry *)
	    (lc->lro_mbuf_data + lro_mbufs);

	/* setup linked list */
	for (i = 0; i != lro_entries; i++)
		LIST_INSERT_HEAD(&lc->lro_free, le + i, next);

	return (0);
}
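
/*
 * Example: a minimal sketch of how a network driver might consume this
 * API, assuming a hypothetical softc "sc" holding an lro_ctrl and a
 * hypothetical drv_rx_next_frame() helper that returns one mbuf per
 * received frame.  Locking, NET_EPOCH handling and error paths are
 * omitted; the names below are illustrative only:
 *
 *	if (tcp_lro_init_args(&sc->lro, sc->ifp, tcp_lro_entries, 0) != 0)
 *		return (ENOMEM);		// fall back to plain receive
 *	...
 *	while ((m = drv_rx_next_frame(sc)) != NULL) {
 *		if (tcp_lro_rx(&sc->lro, m, 0) != 0)
 *			(*sc->ifp->if_input)(sc->ifp, m);	// not LRO-able
 *	}
 *	tcp_lro_flush_all(&sc->lro);		// push merged chains upward
 *	...
 *	tcp_lro_free(&sc->lro);			// on detach
 */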

struct vxlan_header {
	uint32_t	vxlh_flags;
	uint32_t	vxlh_vni;
};

static inline void *
tcp_lro_low_level_parser(void *ptr, struct lro_parser *parser, bool update_data, bool is_vxlan, int mlen)
{
	const struct ether_vlan_header *eh;
	void *old;
	uint16_t eth_type;

	if (update_data)
		memset(parser, 0, sizeof(*parser));

	old = ptr;

	if (is_vxlan) {
		const struct vxlan_header *vxh;
		vxh = ptr;
		ptr = (uint8_t *)ptr + sizeof(*vxh);
		if (update_data) {
			parser->data.vxlan_vni =
			    vxh->vxlh_vni & htonl(0xffffff00);
		}
	}

	eh = ptr;
	if (__predict_false(eh->evl_encap_proto == htons(ETHERTYPE_VLAN))) {
		eth_type = eh->evl_proto;
		if (update_data) {
			/* strip priority and keep VLAN ID only */
			parser->data.vlan_id = eh->evl_tag & htons(EVL_VLID_MASK);
		}
		/* advance to next header */
		ptr = (uint8_t *)ptr + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
		mlen -= (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
	} else {
		eth_type = eh->evl_encap_proto;
		/* advance to next header */
		mlen -= ETHER_HDR_LEN;
		ptr = (uint8_t *)ptr + ETHER_HDR_LEN;
	}
	if (__predict_false(mlen <= 0))
		return (NULL);
	switch (eth_type) {
#ifdef INET
	case htons(ETHERTYPE_IP):
		parser->ip4 = ptr;
		if (__predict_false(mlen < sizeof(struct ip)))
			return (NULL);
		/* Ensure there are no IPv4 options. */
		if ((parser->ip4->ip_hl << 2) != sizeof (*parser->ip4))
			break;
		/* .. and the packet is not fragmented. */
		if (parser->ip4->ip_off & htons(IP_MF|IP_OFFMASK))
			break;
		/* .. and the packet has valid src/dst addrs */
		if (__predict_false(parser->ip4->ip_src.s_addr == INADDR_ANY ||
		    parser->ip4->ip_dst.s_addr == INADDR_ANY))
			break;
		ptr = (uint8_t *)ptr + (parser->ip4->ip_hl << 2);
		mlen -= sizeof(struct ip);
		if (update_data) {
			parser->data.s_addr.v4 = parser->ip4->ip_src;
			parser->data.d_addr.v4 = parser->ip4->ip_dst;
		}
		switch (parser->ip4->ip_p) {
		case IPPROTO_UDP:
			if (__predict_false(mlen < sizeof(struct udphdr)))
				return (NULL);
			parser->udp = ptr;
			if (update_data) {
				parser->data.lro_type = LRO_TYPE_IPV4_UDP;
				parser->data.s_port = parser->udp->uh_sport;
				parser->data.d_port = parser->udp->uh_dport;
			} else {
				MPASS(parser->data.lro_type == LRO_TYPE_IPV4_UDP);
			}
			ptr = ((uint8_t *)ptr + sizeof(*parser->udp));
			parser->total_hdr_len = (uint8_t *)ptr - (uint8_t *)old;
			return (ptr);
		case IPPROTO_TCP:
			parser->tcp = ptr;
			if (__predict_false(mlen < sizeof(struct tcphdr)))
				return (NULL);
			if (update_data) {
				parser->data.lro_type = LRO_TYPE_IPV4_TCP;
				parser->data.s_port = parser->tcp->th_sport;
				parser->data.d_port = parser->tcp->th_dport;
			} else {
				MPASS(parser->data.lro_type == LRO_TYPE_IPV4_TCP);
			}
			if (__predict_false(mlen < (parser->tcp->th_off << 2)))
				return (NULL);
			ptr = (uint8_t *)ptr + (parser->tcp->th_off << 2);
			parser->total_hdr_len = (uint8_t *)ptr - (uint8_t *)old;
			return (ptr);
		default:
			break;
		}
		break;
#endif
#ifdef INET6
	case htons(ETHERTYPE_IPV6):
		parser->ip6 = ptr;
		if (__predict_false(mlen < sizeof(struct ip6_hdr)))
			return (NULL);
		/* Ensure the packet has valid src/dst addrs */
		if (__predict_false(IN6_IS_ADDR_UNSPECIFIED(&parser->ip6->ip6_src) ||
		    IN6_IS_ADDR_UNSPECIFIED(&parser->ip6->ip6_dst)))
			return (NULL);
		ptr = (uint8_t *)ptr + sizeof(*parser->ip6);
		if (update_data) {
			parser->data.s_addr.v6 = parser->ip6->ip6_src;
			parser->data.d_addr.v6 = parser->ip6->ip6_dst;
		}
		mlen -= sizeof(struct ip6_hdr);
		switch (parser->ip6->ip6_nxt) {
		case IPPROTO_UDP:
			if (__predict_false(mlen < sizeof(struct udphdr)))
				return (NULL);
			parser->udp = ptr;
			if (update_data) {
				parser->data.lro_type = LRO_TYPE_IPV6_UDP;
				parser->data.s_port = parser->udp->uh_sport;
				parser->data.d_port = parser->udp->uh_dport;
			} else {
				MPASS(parser->data.lro_type == LRO_TYPE_IPV6_UDP);
			}
			ptr = (uint8_t *)ptr + sizeof(*parser->udp);
			parser->total_hdr_len = (uint8_t *)ptr - (uint8_t *)old;
			return (ptr);
		case IPPROTO_TCP:
			if (__predict_false(mlen < sizeof(struct tcphdr)))
				return (NULL);
			parser->tcp = ptr;
			if (update_data) {
				parser->data.lro_type = LRO_TYPE_IPV6_TCP;
				parser->data.s_port = parser->tcp->th_sport;
				parser->data.d_port = parser->tcp->th_dport;
			} else {
				MPASS(parser->data.lro_type == LRO_TYPE_IPV6_TCP);
			}
			if (__predict_false(mlen < (parser->tcp->th_off << 2)))
				return (NULL);
			ptr = (uint8_t *)ptr + (parser->tcp->th_off << 2);
			parser->total_hdr_len = (uint8_t *)ptr - (uint8_t *)old;
			return (ptr);
		default:
			break;
		}
		break;
#endif
	default:
		break;
	}
	/* Invalid packet - cannot parse */
	return (NULL);
}

static const int vxlan_csum = CSUM_INNER_L3_CALC | CSUM_INNER_L3_VALID |
    CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID;

static inline struct lro_parser *
tcp_lro_parser(struct mbuf *m, struct lro_parser *po, struct lro_parser *pi, bool update_data)
{
	void *data_ptr;

	/* Try to parse outer headers first. */
	data_ptr = tcp_lro_low_level_parser(m->m_data, po, update_data, false, m->m_len);
	if (data_ptr == NULL || po->total_hdr_len > m->m_len)
		return (NULL);

	if (update_data) {
		/* Store VLAN ID, if any. */
		if (__predict_false(m->m_flags & M_VLANTAG)) {
			po->data.vlan_id =
			    htons(m->m_pkthdr.ether_vtag) & htons(EVL_VLID_MASK);
		}
		/* Store decrypted flag, if any. */
		if (__predict_false((m->m_pkthdr.csum_flags &
		    CSUM_TLS_MASK) == CSUM_TLS_DECRYPTED))
			po->data.lro_flags |= LRO_FLAG_DECRYPTED;
	}

	switch (po->data.lro_type) {
	case LRO_TYPE_IPV4_UDP:
	case LRO_TYPE_IPV6_UDP:
		/* Check for VXLAN headers. */
		if ((m->m_pkthdr.csum_flags & vxlan_csum) != vxlan_csum)
			break;

		/* Try to parse inner headers. */
		data_ptr = tcp_lro_low_level_parser(data_ptr, pi, update_data, true,
		    (m->m_len - ((caddr_t)data_ptr - m->m_data)));
		if (data_ptr == NULL || (pi->total_hdr_len + po->total_hdr_len) > m->m_len)
			break;

		/* Verify supported header types. */
		switch (pi->data.lro_type) {
		case LRO_TYPE_IPV4_TCP:
		case LRO_TYPE_IPV6_TCP:
			return (pi);
		default:
			break;
		}
		break;
	case LRO_TYPE_IPV4_TCP:
	case LRO_TYPE_IPV6_TCP:
		if (update_data)
			memset(pi, 0, sizeof(*pi));
		return (po);
	default:
		break;
	}
	return (NULL);
}

static inline int
tcp_lro_trim_mbuf_chain(struct mbuf *m, const struct lro_parser *po)
{
	int len;

	switch (po->data.lro_type) {
#ifdef INET
	case LRO_TYPE_IPV4_TCP:
		len = ((uint8_t *)po->ip4 - (uint8_t *)m->m_data) +
		    ntohs(po->ip4->ip_len);
		break;
#endif
#ifdef INET6
	case LRO_TYPE_IPV6_TCP:
		len = ((uint8_t *)po->ip6 - (uint8_t *)m->m_data) +
		    ntohs(po->ip6->ip6_plen) + sizeof(*po->ip6);
		break;
#endif
	default:
		return (TCP_LRO_CANNOT);
	}

	/*
	 * If the frame is padded beyond the end of the IP packet,
	 * then trim the extra bytes off:
	 */
	if (__predict_true(m->m_pkthdr.len == len)) {
		return (0);
	} else if (m->m_pkthdr.len > len) {
		m_adj(m, len - m->m_pkthdr.len);
		return (0);
	}
	return (TCP_LRO_CANNOT);
}

static void
lro_free_mbuf_chain(struct mbuf *m)
{
	struct mbuf *save;

	while (m) {
		save = m->m_nextpkt;
		m->m_nextpkt = NULL;
		m_freem(m);
		m = save;
	}
}

void
tcp_lro_free(struct lro_ctrl *lc)
{
	struct lro_entry *le;
	unsigned x;

	/* reset LRO free list */
	LIST_INIT(&lc->lro_free);

	/* free active mbufs, if any */
	while ((le = LIST_FIRST(&lc->lro_active)) != NULL) {
		tcp_lro_active_remove(le);
		lro_free_mbuf_chain(le->m_head);
	}

	/* free hash table */
	free(lc->lro_hash, M_LRO);
	lc->lro_hash = NULL;
	lc->lro_hashsz = 0;

	/* free mbuf array, if any */
	for (x = 0; x != lc->lro_mbuf_count; x++)
		m_freem(lc->lro_mbuf_data[x].mb);
	lc->lro_mbuf_count = 0;

	/* free allocated memory, if any */
	free(lc->lro_mbuf_data, M_LRO);
	lc->lro_mbuf_data = NULL;
}

static uint16_t
tcp_lro_rx_csum_tcphdr(const struct tcphdr *th)
{
	const uint16_t *ptr;
	uint32_t csum;
	uint16_t len;

	csum = -th->th_sum;	/* exclude checksum field */
	len = th->th_off;
	ptr = (const uint16_t *)th;
	while (len--) {
		csum += *ptr;
		ptr++;
		csum += *ptr;
		ptr++;
	}
	while (csum > 0xffff)
		csum = (csum >> 16) + (csum & 0xffff);

	return (csum);
}

static uint16_t
tcp_lro_rx_csum_data(const struct lro_parser *pa, uint16_t tcp_csum)
{
	uint32_t c;
	uint16_t cs;

	c = tcp_csum;

	switch (pa->data.lro_type) {
#ifdef INET6
	case LRO_TYPE_IPV6_TCP:
		/* Compute full pseudo IPv6 header checksum. */
		cs = in6_cksum_pseudo(pa->ip6, ntohs(pa->ip6->ip6_plen), pa->ip6->ip6_nxt, 0);
		break;
#endif
#ifdef INET
	case LRO_TYPE_IPV4_TCP:
		/* Compute full pseudo IPv4 header checksum. */
		cs = in_addword(ntohs(pa->ip4->ip_len) - sizeof(*pa->ip4), IPPROTO_TCP);
		cs = in_pseudo(pa->ip4->ip_src.s_addr, pa->ip4->ip_dst.s_addr, htons(cs));
		break;
#endif
	default:
		cs = 0;		/* Keep compiler happy. */
		break;
	}

	/* Complement checksum. */
	cs = ~cs;
	c += cs;

	/* Remove TCP header checksum. */
	cs = ~tcp_lro_rx_csum_tcphdr(pa->tcp);
	c += cs;

	/* Compute checksum remainder. */
	while (c > 0xffff)
		c = (c >> 16) + (c & 0xffff);

	return (c);
}
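
/*
 * The derivation above works in one's-complement arithmetic, where
 * subtraction is addition of the complement.  Conceptually, starting
 * from the sum over pseudo-header + TCP header + payload (which the
 * callers pass in as the complemented checksum field, ~th_sum, or the
 * complemented hardware csum), the payload-only sum used for merging
 * is roughly:
 *
 *	data_csum = ~th_sum + ~pseudo_hdr_sum + ~tcp_hdr_sum
 *
 * with carries folded back into the low 16 bits, using the same
 * folding idiom that appears throughout this file:
 *
 *	while (csum > 0xffff)
 *		csum = (csum >> 16) + (csum & 0xffff);
 */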

static void
tcp_lro_rx_done(struct lro_ctrl *lc)
{
	struct lro_entry *le;

	while ((le = LIST_FIRST(&lc->lro_active)) != NULL) {
		tcp_lro_active_remove(le);
		tcp_lro_flush(lc, le);
	}
}

static void
tcp_lro_flush_active(struct lro_ctrl *lc)
{
	struct lro_entry *le, *le_tmp;

	/*
	 * Walk through the list of le entries and flush any that
	 * hold packets.  This is called when an inbound packet
	 * (e.g. a SYN) must have all previously seen packets
	 * delivered ahead of it.  Note that we must remove each
	 * entry first, because tcp_lro_flush() assumes the entry
	 * is being freed.  That is fine; the entry simply gets
	 * reallocated later as if it were new.
	 */
	LIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) {
		if (le->m_head != NULL) {
			tcp_lro_active_remove(le);
			tcp_lro_flush(lc, le);
		}
	}
}

void
tcp_lro_flush_inactive(struct lro_ctrl *lc, const struct timeval *timeout)
{
	struct lro_entry *le, *le_tmp;
	uint64_t now, tov;
	struct bintime bt;

	NET_EPOCH_ASSERT();
	if (LIST_EMPTY(&lc->lro_active))
		return;

	/* get timeout time and current time in ns */
	binuptime(&bt);
	now = bintime2ns(&bt);
	tov = ((timeout->tv_sec * 1000000000) + (timeout->tv_usec * 1000));
	LIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) {
		if (now >= (bintime2ns(&le->alloc_time) + tov)) {
			tcp_lro_active_remove(le);
			tcp_lro_flush(lc, le);
		}
	}
}

#ifdef INET
static int
tcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4)
{
	uint16_t csum;

	/* Legacy IP has a header checksum that needs to be correct. */
	if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) {
		if (__predict_false((m->m_pkthdr.csum_flags & CSUM_IP_VALID) == 0)) {
			lc->lro_bad_csum++;
			return (TCP_LRO_CANNOT);
		}
	} else {
		csum = in_cksum_hdr(ip4);
		if (__predict_false(csum != 0)) {
			lc->lro_bad_csum++;
			return (TCP_LRO_CANNOT);
		}
	}
	return (0);
}
#endif

static inline void
tcp_lro_assign_and_checksum_16(uint16_t *ptr, uint16_t value, uint16_t *psum)
{
	uint32_t csum;

	csum = 0xffff - *ptr + value;
	while (csum > 0xffff)
		csum = (csum >> 16) + (csum & 0xffff);
	*ptr = value;
	*psum = csum;
}

static uint16_t
tcp_lro_update_checksum(const struct lro_parser *pa, const struct lro_entry *le,
    uint16_t payload_len, uint16_t delta_sum)
{
	uint32_t csum;
	uint16_t tlen;
	uint16_t temp[5] = {};

	switch (pa->data.lro_type) {
	case LRO_TYPE_IPV4_TCP:
		/* Compute new IPv4 length. */
		tlen = (pa->ip4->ip_hl << 2) + (pa->tcp->th_off << 2) + payload_len;
		tcp_lro_assign_and_checksum_16(&pa->ip4->ip_len, htons(tlen), &temp[0]);

		/* Subtract delta from current IPv4 checksum. */
		csum = pa->ip4->ip_sum + 0xffff - temp[0];
		while (csum > 0xffff)
			csum = (csum >> 16) + (csum & 0xffff);
		tcp_lro_assign_and_checksum_16(&pa->ip4->ip_sum, csum, &temp[1]);
		goto update_tcp_header;

	case LRO_TYPE_IPV6_TCP:
		/* Compute new IPv6 length. */
		tlen = (pa->tcp->th_off << 2) + payload_len;
		tcp_lro_assign_and_checksum_16(&pa->ip6->ip6_plen, htons(tlen), &temp[0]);
		goto update_tcp_header;

	case LRO_TYPE_IPV4_UDP:
		/* Compute new IPv4 length. */
		tlen = (pa->ip4->ip_hl << 2) + sizeof(*pa->udp) + payload_len;
		tcp_lro_assign_and_checksum_16(&pa->ip4->ip_len, htons(tlen), &temp[0]);

		/* Subtract delta from current IPv4 checksum. */
		csum = pa->ip4->ip_sum + 0xffff - temp[0];
		while (csum > 0xffff)
			csum = (csum >> 16) + (csum & 0xffff);
		tcp_lro_assign_and_checksum_16(&pa->ip4->ip_sum, csum, &temp[1]);
		goto update_udp_header;

	case LRO_TYPE_IPV6_UDP:
		/* Compute new IPv6 length. */
		tlen = sizeof(*pa->udp) + payload_len;
		tcp_lro_assign_and_checksum_16(&pa->ip6->ip6_plen, htons(tlen), &temp[0]);
		goto update_udp_header;

	default:
		return (0);
	}

update_tcp_header:
	/* Compute current TCP header checksum. */
	temp[2] = tcp_lro_rx_csum_tcphdr(pa->tcp);

	/* Incorporate the latest ACK into the TCP header. */
	pa->tcp->th_ack = le->ack_seq;
	pa->tcp->th_win = le->window;

	/* Incorporate latest timestamp into the TCP header. */
	if (le->timestamp != 0) {
		uint32_t *ts_ptr;

		ts_ptr = (uint32_t *)(pa->tcp + 1);
		ts_ptr[1] = htonl(le->tsval);
		ts_ptr[2] = le->tsecr;
	}

	/* Compute new TCP header checksum. */
	temp[3] = tcp_lro_rx_csum_tcphdr(pa->tcp);

	/* Compute new TCP checksum. */
	csum = pa->tcp->th_sum + 0xffff - delta_sum +
	    0xffff - temp[0] + 0xffff - temp[3] + temp[2];
	while (csum > 0xffff)
		csum = (csum >> 16) + (csum & 0xffff);

	/* Assign new TCP checksum. */
	tcp_lro_assign_and_checksum_16(&pa->tcp->th_sum, csum, &temp[4]);

	/* Compute all modifications affecting next checksum. */
	csum = temp[0] + temp[1] + 0xffff - temp[2] +
	    temp[3] + temp[4] + delta_sum;
	while (csum > 0xffff)
		csum = (csum >> 16) + (csum & 0xffff);

	/* Return delta checksum to next stage, if any. */
	return (csum);

update_udp_header:
	tlen = sizeof(*pa->udp) + payload_len;
	/* Assign new UDP length and compute checksum delta. */
	tcp_lro_assign_and_checksum_16(&pa->udp->uh_ulen, htons(tlen), &temp[2]);

	/* Check if there is a UDP checksum. */
	if (__predict_false(pa->udp->uh_sum != 0)) {
		/* Compute new UDP checksum. */
		csum = pa->udp->uh_sum + 0xffff - delta_sum +
		    0xffff - temp[0] + 0xffff - temp[2];
		while (csum > 0xffff)
			csum = (csum >> 16) + (csum & 0xffff);
		/* Assign new UDP checksum. */
		tcp_lro_assign_and_checksum_16(&pa->udp->uh_sum, csum, &temp[3]);
	}

	/* Compute all modifications affecting next checksum. */
	csum = temp[0] + temp[1] + temp[2] + temp[3] + delta_sum;
	while (csum > 0xffff)
		csum = (csum >> 16) + (csum & 0xffff);

	/* Return delta checksum to next stage, if any. */
	return (csum);
}
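
/*
 * The header rewrite above is an incremental checksum update in the
 * spirit of RFC 1624 rather than a full recomputation: for each 16-bit
 * header field that changes, tcp_lro_assign_and_checksum_16() records
 * the one's-complement delta (0xffff - old + new) in its *psum output,
 * and the callers fold those deltas into the affected checksum fields.
 * The accumulated delta is then returned so that an encapsulating
 * (e.g. VXLAN outer) header can be fixed up in a second pass without
 * touching the payload again.
 */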

static void
tcp_flush_out_entry(struct lro_ctrl *lc, struct lro_entry *le)
{
	/* Check if we need to recompute any checksums. */
	if (le->needs_merge) {
		uint16_t csum;

		switch (le->inner.data.lro_type) {
		case LRO_TYPE_IPV4_TCP:
			csum = tcp_lro_update_checksum(&le->inner, le,
			    le->m_head->m_pkthdr.lro_tcp_d_len,
			    le->m_head->m_pkthdr.lro_tcp_d_csum);
			csum = tcp_lro_update_checksum(&le->outer, NULL,
			    le->m_head->m_pkthdr.lro_tcp_d_len +
			    le->inner.total_hdr_len, csum);
			le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
			    CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID;
			le->m_head->m_pkthdr.csum_data = 0xffff;
			if (__predict_false(le->outer.data.lro_flags & LRO_FLAG_DECRYPTED))
				le->m_head->m_pkthdr.csum_flags |= CSUM_TLS_DECRYPTED;
			break;
		case LRO_TYPE_IPV6_TCP:
			csum = tcp_lro_update_checksum(&le->inner, le,
			    le->m_head->m_pkthdr.lro_tcp_d_len,
			    le->m_head->m_pkthdr.lro_tcp_d_csum);
			csum = tcp_lro_update_checksum(&le->outer, NULL,
			    le->m_head->m_pkthdr.lro_tcp_d_len +
			    le->inner.total_hdr_len, csum);
			le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
			    CSUM_PSEUDO_HDR;
			le->m_head->m_pkthdr.csum_data = 0xffff;
			if (__predict_false(le->outer.data.lro_flags & LRO_FLAG_DECRYPTED))
				le->m_head->m_pkthdr.csum_flags |= CSUM_TLS_DECRYPTED;
			break;
		case LRO_TYPE_NONE:
			switch (le->outer.data.lro_type) {
			case LRO_TYPE_IPV4_TCP:
				csum = tcp_lro_update_checksum(&le->outer, le,
				    le->m_head->m_pkthdr.lro_tcp_d_len,
				    le->m_head->m_pkthdr.lro_tcp_d_csum);
				le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
				    CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID;
				le->m_head->m_pkthdr.csum_data = 0xffff;
				if (__predict_false(le->outer.data.lro_flags & LRO_FLAG_DECRYPTED))
					le->m_head->m_pkthdr.csum_flags |= CSUM_TLS_DECRYPTED;
				break;
			case LRO_TYPE_IPV6_TCP:
				csum = tcp_lro_update_checksum(&le->outer, le,
				    le->m_head->m_pkthdr.lro_tcp_d_len,
				    le->m_head->m_pkthdr.lro_tcp_d_csum);
				le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
				    CSUM_PSEUDO_HDR;
				le->m_head->m_pkthdr.csum_data = 0xffff;
				if (__predict_false(le->outer.data.lro_flags & LRO_FLAG_DECRYPTED))
					le->m_head->m_pkthdr.csum_flags |= CSUM_TLS_DECRYPTED;
				break;
			default:
				break;
			}
			break;
		default:
			break;
		}
	}

	/*
	 * Break any chain.  In the singleton case m_nextpkt has not
	 * been cleared; in the other cases it was already set to NULL
	 * in tcp_push_and_replace().
	 */
	le->m_head->m_nextpkt = NULL;
	lc->lro_queued += le->m_head->m_pkthdr.lro_nsegs;
	(*lc->ifp->if_input)(lc->ifp, le->m_head);
}

static void
tcp_set_entry_to_mbuf(struct lro_ctrl *lc, struct lro_entry *le,
    struct mbuf *m, struct tcphdr *th)
{
	uint32_t *ts_ptr;
	uint16_t tcp_data_len;
	uint16_t tcp_opt_len;

	ts_ptr = (uint32_t *)(th + 1);
	tcp_opt_len = (th->th_off << 2);
	tcp_opt_len -= sizeof(*th);

	/* Check if there is a timestamp option. */
	if (tcp_opt_len == 0 ||
	    __predict_false(tcp_opt_len != TCPOLEN_TSTAMP_APPA ||
	    *ts_ptr != TCP_LRO_TS_OPTION)) {
		/* We failed to find the timestamp option. */
		le->timestamp = 0;
	} else {
		le->timestamp = 1;
		le->tsval = ntohl(*(ts_ptr + 1));
		le->tsecr = *(ts_ptr + 2);
	}

	tcp_data_len = m->m_pkthdr.lro_tcp_d_len;

	/* Pull out TCP sequence numbers and window size. */
	le->next_seq = ntohl(th->th_seq) + tcp_data_len;
	le->ack_seq = th->th_ack;
	le->window = th->th_win;
	le->flags = tcp_get_flags(th);
	le->needs_merge = 0;

	/* Setup new data pointers. */
	le->m_head = m;
	le->m_tail = m_last(m);
}

static void
tcp_push_and_replace(struct lro_ctrl *lc, struct lro_entry *le, struct mbuf *m)
{
	struct lro_parser *pa;

	/*
	 * Push the current entry up the stack and replace it with "m".
	 */
	struct mbuf *msave;

	/* Grab off the next and save it */
	msave = le->m_head->m_nextpkt;
	le->m_head->m_nextpkt = NULL;

	/* Now push out the old entry */
	tcp_flush_out_entry(lc, le);

	/* Re-parse new header, should not fail. */
	pa = tcp_lro_parser(m, &le->outer, &le->inner, false);
	KASSERT(pa != NULL,
	    ("tcp_push_and_replace: LRO parser failed on m=%p\n", m));

	/*
	 * To place the new data properly in the entry we have to
	 * reset the TCP header and other fields.
	 */
	tcp_set_entry_to_mbuf(lc, le, m, pa->tcp);

	/* Restore the next list */
	m->m_nextpkt = msave;
}

static void
tcp_lro_mbuf_append_pkthdr(struct lro_entry *le, const struct mbuf *p)
{
	struct mbuf *m;
	uint32_t csum;

	m = le->m_head;
	if (m->m_pkthdr.lro_nsegs == 1) {
		/* Compute relative checksum. */
		csum = p->m_pkthdr.lro_tcp_d_csum;
	} else {
		/* Merge TCP data checksums. */
		csum = (uint32_t)m->m_pkthdr.lro_tcp_d_csum +
		    (uint32_t)p->m_pkthdr.lro_tcp_d_csum;
		while (csum > 0xffff)
			csum = (csum >> 16) + (csum & 0xffff);
	}

	/* Update various counters. */
	m->m_pkthdr.len += p->m_pkthdr.lro_tcp_d_len;
	m->m_pkthdr.lro_tcp_d_csum = csum;
	m->m_pkthdr.lro_tcp_d_len += p->m_pkthdr.lro_tcp_d_len;
	m->m_pkthdr.lro_nsegs += p->m_pkthdr.lro_nsegs;
	le->needs_merge = 1;
}
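
/*
 * Summary of the merge rules enforced below: a queued segment can only
 * be appended to the current head if it carries no TCP options other
 * than a well-formed timestamp, has no flags set besides ACK/PUSH, is
 * exactly in sequence, is not a pure duplicate ACK, does not push the
 * entry past lro_ackcnt_lim or lro_length_lim, and does not move the
 * timestamp value backwards.  Anything else forces the accumulated
 * entry out via tcp_push_and_replace() and restarts with the offending
 * segment as the new head.
 */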

static void
tcp_lro_condense(struct lro_ctrl *lc, struct lro_entry *le)
{
	/*
	 * Walk through the mbuf chain we
	 * have on tap and compress/condense
	 * as required.
	 */
	uint32_t *ts_ptr;
	struct mbuf *m;
	struct tcphdr *th;
	uint32_t tcp_data_len_total;
	uint32_t tcp_data_seg_total;
	uint16_t tcp_data_len;
	uint16_t tcp_opt_len;

	/*
	 * First we must check the lead (m_head) and make sure that
	 * it is *not* something that should be sent up right away
	 * (SACK, etc.).
	 */
again:
	m = le->m_head->m_nextpkt;
	if (m == NULL) {
		/* Just one left. */
		return;
	}

	th = tcp_lro_get_th(m);
	tcp_opt_len = (th->th_off << 2);
	tcp_opt_len -= sizeof(*th);
	ts_ptr = (uint32_t *)(th + 1);

	if (tcp_opt_len != 0 && __predict_false(tcp_opt_len != TCPOLEN_TSTAMP_APPA ||
	    *ts_ptr != TCP_LRO_TS_OPTION)) {
		/*
		 * It is not the timestamp option, so we can't use
		 * this segment as the head.
		 */
		le->m_head->m_nextpkt = m->m_nextpkt;
		tcp_push_and_replace(lc, le, m);
		goto again;
	}
	if ((tcp_get_flags(th) & ~(TH_ACK | TH_PUSH)) != 0) {
		/*
		 * Make sure that previously seen segments/ACKs are delivered
		 * before this segment, e.g. FIN.
		 */
		le->m_head->m_nextpkt = m->m_nextpkt;
		tcp_push_and_replace(lc, le, m);
		goto again;
	}
	while ((m = le->m_head->m_nextpkt) != NULL) {
		/*
		 * condense m into le, first
		 * pull m out of the list.
		 */
		le->m_head->m_nextpkt = m->m_nextpkt;
		m->m_nextpkt = NULL;
		/* Setup my data */
		tcp_data_len = m->m_pkthdr.lro_tcp_d_len;
		th = tcp_lro_get_th(m);
		ts_ptr = (uint32_t *)(th + 1);
		tcp_opt_len = (th->th_off << 2);
		tcp_opt_len -= sizeof(*th);
		tcp_data_len_total = le->m_head->m_pkthdr.lro_tcp_d_len + tcp_data_len;
		tcp_data_seg_total = le->m_head->m_pkthdr.lro_nsegs + m->m_pkthdr.lro_nsegs;

		if (tcp_data_seg_total >= lc->lro_ackcnt_lim ||
		    tcp_data_len_total >= lc->lro_length_lim) {
			/* Flush now if appending will result in overflow. */
			tcp_push_and_replace(lc, le, m);
			goto again;
		}
		if (tcp_opt_len != 0 &&
		    __predict_false(tcp_opt_len != TCPOLEN_TSTAMP_APPA ||
		    *ts_ptr != TCP_LRO_TS_OPTION)) {
			/*
			 * Maybe a SACK in the new segment?  We have to
			 * flush the current le and start all over from
			 * the beginning (possibly calling the replace
			 * again, or just returning).
			 */
			tcp_push_and_replace(lc, le, m);
			goto again;
		}
		if ((tcp_get_flags(th) & ~(TH_ACK | TH_PUSH)) != 0) {
			tcp_push_and_replace(lc, le, m);
			goto again;
		}
		if (tcp_opt_len != 0) {
			uint32_t tsval = ntohl(*(ts_ptr + 1));
			/* Make sure timestamp values are increasing. */
			if (TSTMP_GT(le->tsval, tsval)) {
				tcp_push_and_replace(lc, le, m);
				goto again;
			}
			le->tsval = tsval;
			le->tsecr = *(ts_ptr + 2);
		}
		/* Try to append the new segment. */
		if (__predict_false(ntohl(th->th_seq) != le->next_seq ||
		    ((tcp_get_flags(th) & TH_ACK) !=
		    (le->flags & TH_ACK)) ||
		    (tcp_data_len == 0 &&
		    le->ack_seq == th->th_ack &&
		    le->window == th->th_win))) {
			/* Out of order packet, non-ACK + ACK or dup ACK. */
			tcp_push_and_replace(lc, le, m);
			goto again;
		}
		if (tcp_data_len != 0 ||
		    SEQ_GT(ntohl(th->th_ack), ntohl(le->ack_seq))) {
			le->next_seq += tcp_data_len;
			le->ack_seq = th->th_ack;
			le->window = th->th_win;
			le->needs_merge = 1;
		} else if (th->th_ack == le->ack_seq) {
			if (WIN_GT(th->th_win, le->window)) {
				le->window = th->th_win;
				le->needs_merge = 1;
			}
		}

		if (tcp_data_len == 0) {
			m_freem(m);
			continue;
		}

		/* Merge TCP data checksum and length to head mbuf. */
		tcp_lro_mbuf_append_pkthdr(le, m);

		/*
		 * Adjust the mbuf so that m_data points to the first byte of
		 * the ULP payload.  Adjust the mbuf to avoid complications and
		 * append new segment to existing mbuf chain.
		 */
		m_adj(m, m->m_pkthdr.len - tcp_data_len);
		m_demote_pkthdr(m);
		le->m_tail->m_next = m;
		le->m_tail = m_last(m);
	}
}

void
tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le)
{

	/* Only optimise if there are multiple packets waiting. */
	NET_EPOCH_ASSERT();
	if (tcp_lro_flush_tcphpts == NULL ||
	    tcp_lro_flush_tcphpts(lc, le) != 0) {
		tcp_lro_condense(lc, le);
		tcp_flush_out_entry(lc, le);
	}
	lc->lro_flushed++;
	bzero(le, sizeof(*le));
	LIST_INSERT_HEAD(&lc->lro_free, le, next);
}

#define	tcp_lro_msb_64(x) (1ULL << (flsll(x) - 1))

/*
 * The tcp_lro_sort() routine is comparable to qsort(), except it has
 * a worst case complexity limit of O(MIN(N,64)*N), where N is the
 * number of elements to sort and 64 is the number of sequence bits
 * available.  The algorithm is bit-slicing the 64-bit sequence number,
 * sorting one bit at a time from the most significant bit until the
 * least significant one, skipping the constant bits.  This is
 * typically called a radix sort.
 */
static void
tcp_lro_sort(struct lro_mbuf_sort *parray, uint32_t size)
{
	struct lro_mbuf_sort temp;
	uint64_t ones;
	uint64_t zeros;
	uint32_t x;
	uint32_t y;

repeat:
	/* for small arrays insertion sort is faster */
	if (size <= 12) {
		for (x = 1; x < size; x++) {
			temp = parray[x];
			for (y = x; y > 0 && temp.seq < parray[y - 1].seq; y--)
				parray[y] = parray[y - 1];
			parray[y] = temp;
		}
		return;
	}

	/* compute sequence bits which are constant */
	ones = 0;
	zeros = 0;
	for (x = 0; x != size; x++) {
		ones |= parray[x].seq;
		zeros |= ~parray[x].seq;
	}

	/* compute bits which are not constant into "ones" */
	ones &= zeros;
	if (ones == 0)
		return;

	/* pick the most significant bit which is not constant */
	ones = tcp_lro_msb_64(ones);

	/*
	 * Move entries having cleared sequence bits to the beginning
	 * of the array:
	 */
	for (x = y = 0; y != size; y++) {
		/* skip set bits */
		if (parray[y].seq & ones)
			continue;
		/* swap entries */
		temp = parray[x];
		parray[x] = parray[y];
		parray[y] = temp;
		x++;
	}

	KASSERT(x != 0 && x != size, ("Memory is corrupted\n"));

	/* sort zeros */
	tcp_lro_sort(parray, x);

	/* sort ones */
	parray += x;
	size -= x;
	goto repeat;
}
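
/*
 * The 64-bit keys sorted above are built in tcp_lro_queue_mbuf(): the
 * RSS hash type occupies the top 8 bits, the 32-bit flowid the next 32
 * bits, and the queueing order the low 24 bits.  Sorting on this key
 * groups mbufs of the same flow together while preserving their arrival
 * order within each flow, which is why tcp_lro_flush_all() can simply
 * flush the active entries whenever the upper bits (the part above the
 * low 24 bits) change.
 */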

void
tcp_lro_flush_all(struct lro_ctrl *lc)
{
	uint64_t seq;
	uint64_t nseq;
	unsigned x;

	NET_EPOCH_ASSERT();
	/* check if no mbufs to flush */
	if (lc->lro_mbuf_count == 0)
		goto done;
	if (lc->lro_cpu_is_set == 0) {
		if (lc->lro_last_cpu == curcpu) {
			lc->lro_cnt_of_same_cpu++;
			/* Have we reached the threshold to declare a cpu? */
			if (lc->lro_cnt_of_same_cpu > tcp_lro_cpu_set_thresh)
				lc->lro_cpu_is_set = 1;
		} else {
			lc->lro_last_cpu = curcpu;
			lc->lro_cnt_of_same_cpu = 0;
		}
	}
	CURVNET_SET(lc->ifp->if_vnet);

	/* get current time */
	binuptime(&lc->lro_last_queue_time);

	/* sort all mbufs according to stream */
	tcp_lro_sort(lc->lro_mbuf_data, lc->lro_mbuf_count);

	/* input data into LRO engine, stream by stream */
	seq = 0;
	for (x = 0; x != lc->lro_mbuf_count; x++) {
		struct mbuf *mb;

		/* get mbuf */
		mb = lc->lro_mbuf_data[x].mb;

		/* get sequence number, masking away the packet index */
		nseq = lc->lro_mbuf_data[x].seq & (-1ULL << 24);

		/* check for new stream */
		if (seq != nseq) {
			seq = nseq;

			/* flush active streams */
			tcp_lro_rx_done(lc);
		}

		/* add packet to LRO engine */
		if (tcp_lro_rx_common(lc, mb, 0, false) != 0) {
			/* Flush anything we have accumulated */
			tcp_lro_flush_active(lc);
			/* input packet to network layer */
			(*lc->ifp->if_input)(lc->ifp, mb);
			lc->lro_queued++;
			lc->lro_flushed++;
		}
	}
	CURVNET_RESTORE();
done:
	/* flush active streams */
	tcp_lro_rx_done(lc);
	tcp_hpts_softclock();
	lc->lro_mbuf_count = 0;
}

static struct lro_head *
tcp_lro_rx_get_bucket(struct lro_ctrl *lc, struct mbuf *m, struct lro_parser *parser)
{
	u_long hash;

	if (M_HASHTYPE_ISHASH(m)) {
		hash = m->m_pkthdr.flowid;
	} else {
		for (unsigned i = hash = 0; i != LRO_RAW_ADDRESS_MAX; i++)
			hash += parser->data.raw[i];
	}
	return (&lc->lro_hash[hash % lc->lro_hashsz]);
}

static int
tcp_lro_rx_common(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, bool use_hash)
{
	struct lro_parser pi;	/* inner address data */
	struct lro_parser po;	/* outer address data */
	struct lro_parser *pa;	/* current parser for TCP stream */
	struct lro_entry *le;
	struct lro_head *bucket;
	struct tcphdr *th;
	int tcp_data_len;
	int tcp_opt_len;
	int error;
	uint16_t tcp_data_sum;

#ifdef INET
	/* Quickly decide if packet cannot be LRO'ed */
	if (__predict_false(V_ipforwarding != 0))
		return (TCP_LRO_CANNOT);
#endif
#ifdef INET6
	/* Quickly decide if packet cannot be LRO'ed */
	if (__predict_false(V_ip6_forwarding != 0))
		return (TCP_LRO_CANNOT);
#endif
	if (((m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) !=
	    ((CSUM_DATA_VALID | CSUM_PSEUDO_HDR))) ||
	    (m->m_pkthdr.csum_data != 0xffff)) {
		/*
		 * The checksum either did not have hardware offload
		 * or it was a bad checksum.  We can't LRO such
		 * a packet.
		 */
		counter_u64_add(tcp_bad_csums, 1);
		return (TCP_LRO_CANNOT);
	}
	/* We expect a contiguous header [eh, ip, tcp]. */
	pa = tcp_lro_parser(m, &po, &pi, true);
	if (__predict_false(pa == NULL))
		return (TCP_LRO_NOT_SUPPORTED);

	/* We don't expect any padding. */
	error = tcp_lro_trim_mbuf_chain(m, pa);
	if (__predict_false(error != 0))
		return (error);

#ifdef INET
	switch (pa->data.lro_type) {
	case LRO_TYPE_IPV4_TCP:
		error = tcp_lro_rx_ipv4(lc, m, pa->ip4);
		if (__predict_false(error != 0))
			return (error);
		break;
	default:
		break;
	}
#endif
	/* If no hardware or arrival stamp on the packet add timestamp */
	if ((m->m_flags & (M_TSTMP_LRO | M_TSTMP)) == 0) {
		m->m_pkthdr.rcv_tstmp = bintime2ns(&lc->lro_last_queue_time);
		m->m_flags |= M_TSTMP_LRO;
	}

	/* Get pointer to TCP header. */
	th = pa->tcp;

	/* Don't process SYN packets. */
	if (__predict_false(tcp_get_flags(th) & TH_SYN))
		return (TCP_LRO_CANNOT);

	/* Get total TCP header length and compute payload length. */
	tcp_opt_len = (th->th_off << 2);
	tcp_data_len = m->m_pkthdr.len - ((uint8_t *)th -
	    (uint8_t *)m->m_data) - tcp_opt_len;
	tcp_opt_len -= sizeof(*th);

	/* Don't process invalid TCP headers. */
	if (__predict_false(tcp_opt_len < 0 || tcp_data_len < 0))
		return (TCP_LRO_CANNOT);

	/* Compute TCP data only checksum. */
	if (tcp_data_len == 0)
		tcp_data_sum = 0;	/* no data, no checksum */
	else if (__predict_false(csum != 0))
		tcp_data_sum = tcp_lro_rx_csum_data(pa, ~csum);
	else
		tcp_data_sum = tcp_lro_rx_csum_data(pa, ~th->th_sum);

	/* Save TCP info in mbuf. */
	m->m_nextpkt = NULL;
	m->m_pkthdr.rcvif = lc->ifp;
	m->m_pkthdr.lro_tcp_d_csum = tcp_data_sum;
	m->m_pkthdr.lro_tcp_d_len = tcp_data_len;
	m->m_pkthdr.lro_tcp_h_off = ((uint8_t *)th - (uint8_t *)m->m_data);
	m->m_pkthdr.lro_nsegs = 1;

	/* Get hash bucket. */
	if (!use_hash) {
		bucket = &lc->lro_hash[0];
	} else {
		bucket = tcp_lro_rx_get_bucket(lc, m, pa);
	}

	/* Try to find a matching previous segment. */
	LIST_FOREACH(le, bucket, hash_next) {
		/* Compare addresses and ports. */
		if (lro_address_compare(&po.data, &le->outer.data) == false ||
		    lro_address_compare(&pi.data, &le->inner.data) == false)
			continue;

		/* Check if no data and old ACK. */
		if (tcp_data_len == 0 &&
		    SEQ_LT(ntohl(th->th_ack), ntohl(le->ack_seq))) {
			m_freem(m);
			return (0);
		}

		/* Mark "m" in the last spot. */
		le->m_last_mbuf->m_nextpkt = m;
		/* Now set the tail to "m". */
		le->m_last_mbuf = m;
		return (0);
	}

	/* Try to find an empty slot. */
	if (LIST_EMPTY(&lc->lro_free))
		return (TCP_LRO_NO_ENTRIES);

	/* Start a new segment chain. */
	le = LIST_FIRST(&lc->lro_free);
	LIST_REMOVE(le, next);
	tcp_lro_active_insert(lc, bucket, le);

	/* Make sure the headers are set. */
	le->inner = pi;
	le->outer = po;

	/* Store time this entry was allocated. */
	le->alloc_time = lc->lro_last_queue_time;

	tcp_set_entry_to_mbuf(lc, le, m, th);

	/* Now set the tail to "m". */
	le->m_last_mbuf = m;

	return (0);
}

int
tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
{
	int error;

	if (((m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) !=
	    ((CSUM_DATA_VALID | CSUM_PSEUDO_HDR))) ||
	    (m->m_pkthdr.csum_data != 0xffff)) {
		/*
		 * The checksum either did not have hardware offload
		 * or it was a bad checksum.  We can't LRO such
		 * a packet.
		 */
		counter_u64_add(tcp_bad_csums, 1);
		return (TCP_LRO_CANNOT);
	}
	/* get current time */
	binuptime(&lc->lro_last_queue_time);
	CURVNET_SET(lc->ifp->if_vnet);
	error = tcp_lro_rx_common(lc, m, csum, true);
	if (__predict_false(error != 0)) {
		/*
		 * Flush anything we have accumulated
		 * ahead of this packet that can't
		 * be LRO'd.  This preserves order.
		 */
		tcp_lro_flush_active(lc);
	}
	CURVNET_RESTORE();

	return (error);
}
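
/*
 * tcp_lro_queue_mbuf() below is the deferred alternative to calling
 * tcp_lro_rx() per packet: mbufs are stashed in the lro_mbuf_data array
 * together with a sort key and only handed to tcp_lro_rx_common() in
 * flow order once tcp_lro_flush_all() runs.  A driver receive loop
 * using this path might, speculatively sketched, look like:
 *
 *	while ((m = drv_rx_next_frame(sc)) != NULL)	// hypothetical helper
 *		tcp_lro_queue_mbuf(&sc->lro, m);
 *	tcp_lro_flush_all(&sc->lro);
 *
 * Note that tcp_lro_queue_mbuf() flushes on its own once the array
 * reaches lro_mbuf_max entries.
 */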

void
tcp_lro_queue_mbuf(struct lro_ctrl *lc, struct mbuf *mb)
{
	NET_EPOCH_ASSERT();
	/* sanity checks */
	if (__predict_false(lc->ifp == NULL || lc->lro_mbuf_data == NULL ||
	    lc->lro_mbuf_max == 0)) {
		/* packet drop */
		m_freem(mb);
		return;
	}

	/* check if packet is not LRO capable */
	if (__predict_false((lc->ifp->if_capenable & IFCAP_LRO) == 0)) {
		/* input packet to network layer */
		(*lc->ifp->if_input) (lc->ifp, mb);
		return;
	}

	/* If no hardware or arrival stamp on the packet add timestamp */
	if ((tcplro_stacks_wanting_mbufq > 0) &&
	    (tcp_less_accurate_lro_ts == 0) &&
	    ((mb->m_flags & M_TSTMP) == 0)) {
		/* Add in an LRO time since no hardware */
		binuptime(&lc->lro_last_queue_time);
		mb->m_pkthdr.rcv_tstmp = bintime2ns(&lc->lro_last_queue_time);
		mb->m_flags |= M_TSTMP_LRO;
	}

	/* create sequence number */
	lc->lro_mbuf_data[lc->lro_mbuf_count].seq =
	    (((uint64_t)M_HASHTYPE_GET(mb)) << 56) |
	    (((uint64_t)mb->m_pkthdr.flowid) << 24) |
	    ((uint64_t)lc->lro_mbuf_count);

	/* enter mbuf */
	lc->lro_mbuf_data[lc->lro_mbuf_count].mb = mb;

	/* flush if array is full */
	if (__predict_false(++lc->lro_mbuf_count == lc->lro_mbuf_max))
		tcp_lro_flush_all(lc);
}

/* end */