/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2007, Myricom Inc.
 * Copyright (c) 2008, Intel Corporation.
 * Copyright (c) 2012 The FreeBSD Foundation
 * Copyright (c) 2016 Mellanox Technologies.
 * All rights reserved.
 *
 * Portions of this software were developed by Bjoern Zeeb
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockbuf.h>
#include <sys/sysctl.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/ethernet.h>
#include <net/bpf.h>
#include <net/vnet.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/in_pcb.h>
#include <netinet6/in6_pcb.h>
#include <netinet/tcp.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_lro.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_log_buf.h>
#include <netinet6/ip6_var.h>

#include <machine/in_cksum.h>

static MALLOC_DEFINE(M_LRO, "LRO", "LRO control structures");

#define	TCP_LRO_UPDATE_CSUM	1
#ifndef	TCP_LRO_UPDATE_CSUM
#define	TCP_LRO_INVALID_CSUM	0x0000
#endif

static void	tcp_lro_rx_done(struct lro_ctrl *lc);
static int	tcp_lro_rx2(struct lro_ctrl *lc, struct mbuf *m,
		    uint32_t csum, int use_hash);

SYSCTL_NODE(_net_inet_tcp, OID_AUTO, lro, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "TCP LRO");

static long tcplro_stacks_wanting_mbufq = 0;
counter_u64_t tcp_inp_lro_direct_queue;
counter_u64_t tcp_inp_lro_wokeup_queue;
counter_u64_t tcp_inp_lro_compressed;
counter_u64_t tcp_inp_lro_single_push;
counter_u64_t tcp_inp_lro_locks_taken;
counter_u64_t tcp_inp_lro_sack_wake;
counter_u64_t tcp_extra_mbuf;
counter_u64_t tcp_would_have_but;
counter_u64_t tcp_comp_total;
counter_u64_t tcp_uncomp_total;
counter_u64_t tcp_csum_hardware;
counter_u64_t tcp_csum_hardware_w_ph;
counter_u64_t tcp_csum_software;

static unsigned	tcp_lro_entries = TCP_LRO_ENTRIES;
SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, entries,
    CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_entries, 0,
    "default number of LRO entries");

SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, fullqueue, CTLFLAG_RD,
    &tcp_inp_lro_direct_queue, "Number of lro's fully queued to transport");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, wokeup, CTLFLAG_RD,
    &tcp_inp_lro_wokeup_queue, "Number of lro's where we woke up transport via hpts");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, compressed, CTLFLAG_RD,
    &tcp_inp_lro_compressed, "Number of lro's compressed and sent to transport");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, single, CTLFLAG_RD,
    &tcp_inp_lro_single_push, "Number of lro's sent with single segment");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, lockcnt, CTLFLAG_RD,
    &tcp_inp_lro_locks_taken, "Number of lro's inp_wlocks taken");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, sackwakeups, CTLFLAG_RD,
    &tcp_inp_lro_sack_wake, "Number of wakeups caused by sack/fin");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, extra_mbuf, CTLFLAG_RD,
    &tcp_extra_mbuf, "Number of times we had an extra compressed ack dropped into the tp");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, would_have_but, CTLFLAG_RD,
    &tcp_would_have_but, "Number of times we would have had an extra compressed ack but were out of room");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, with_m_ackcmp, CTLFLAG_RD,
    &tcp_comp_total, "Number of mbufs queued with M_ACKCMP flags set");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, without_m_ackcmp, CTLFLAG_RD,
    &tcp_uncomp_total, "Number of mbufs queued without M_ACKCMP");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, csum_hw, CTLFLAG_RD,
    &tcp_csum_hardware, "Number of checksums processed in hardware");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, csum_hw_ph, CTLFLAG_RD,
    &tcp_csum_hardware_w_ph, "Number of checksums processed in hardware with pseudo header");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, csum_sw, CTLFLAG_RD,
    &tcp_csum_software, "Number of checksums processed in software");

void
tcp_lro_reg_mbufq(void)
{
	atomic_fetchadd_long(&tcplro_stacks_wanting_mbufq, 1);
}

void
tcp_lro_dereg_mbufq(void)
{
	atomic_fetchadd_long(&tcplro_stacks_wanting_mbufq, -1);
}

static __inline void
tcp_lro_active_insert(struct lro_ctrl *lc, struct lro_head *bucket,
    struct lro_entry *le)
{

	LIST_INSERT_HEAD(&lc->lro_active, le, next);
	LIST_INSERT_HEAD(bucket, le, hash_next);
}

static __inline void
tcp_lro_active_remove(struct lro_entry *le)
{

	LIST_REMOVE(le, next);		/* active list */
	LIST_REMOVE(le, hash_next);	/* hash bucket */
}

int
tcp_lro_init(struct lro_ctrl *lc)
{
	return (tcp_lro_init_args(lc, NULL, tcp_lro_entries, 0));
}

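/*
 * Sizing sketch for tcp_lro_init_args() below (derived from the code,
 * restated here for clarity): the hash table is sized by
 * max(lro_entries, lro_mbufs), and a single allocation is carved into
 * an array of lro_mbufs "struct lro_mbuf_sort" records followed by
 * lro_entries "struct lro_entry" slots.  With the tcp_lro_init()
 * defaults (lro_mbufs == 0) no sort array is used and only the entry
 * slots are linked onto lro_free.
 */
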
int
tcp_lro_init_args(struct lro_ctrl *lc, struct ifnet *ifp,
    unsigned lro_entries, unsigned lro_mbufs)
{
	struct lro_entry *le;
	size_t size;
	unsigned i, elements;

	lc->lro_bad_csum = 0;
	lc->lro_queued = 0;
	lc->lro_flushed = 0;
	lc->lro_mbuf_count = 0;
	lc->lro_mbuf_max = lro_mbufs;
	lc->lro_cnt = lro_entries;
	lc->lro_ackcnt_lim = TCP_LRO_ACKCNT_MAX;
	lc->lro_length_lim = TCP_LRO_LENGTH_MAX;
	lc->ifp = ifp;
	LIST_INIT(&lc->lro_free);
	LIST_INIT(&lc->lro_active);

	/* create hash table to accelerate entry lookup */
	if (lro_entries > lro_mbufs)
		elements = lro_entries;
	else
		elements = lro_mbufs;
	lc->lro_hash = phashinit_flags(elements, M_LRO, &lc->lro_hashsz,
	    HASH_NOWAIT);
	if (lc->lro_hash == NULL) {
		memset(lc, 0, sizeof(*lc));
		return (ENOMEM);
	}

	/* compute size to allocate */
	size = (lro_mbufs * sizeof(struct lro_mbuf_sort)) +
	    (lro_entries * sizeof(*le));
	lc->lro_mbuf_data = (struct lro_mbuf_sort *)
	    malloc(size, M_LRO, M_NOWAIT | M_ZERO);

	/* check for out of memory */
	if (lc->lro_mbuf_data == NULL) {
		free(lc->lro_hash, M_LRO);
		memset(lc, 0, sizeof(*lc));
		return (ENOMEM);
	}
	/* compute offset for LRO entries */
	le = (struct lro_entry *)
	    (lc->lro_mbuf_data + lro_mbufs);

	/* setup linked list */
	for (i = 0; i != lro_entries; i++)
		LIST_INSERT_HEAD(&lc->lro_free, le + i, next);

	return (0);
}

static struct tcphdr *
tcp_lro_get_th(struct lro_entry *le, struct mbuf *m)
{
	struct ether_header *eh;
	struct tcphdr *th = NULL;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;	/* Keep compiler happy. */
#endif
#ifdef INET
	struct ip *ip4 = NULL;		/* Keep compiler happy. */
#endif

	eh = mtod(m, struct ether_header *);
	switch (le->eh_type) {
#ifdef INET6
	case ETHERTYPE_IPV6:
		ip6 = (struct ip6_hdr *)(eh + 1);
		th = (struct tcphdr *)(ip6 + 1);
		break;
#endif
#ifdef INET
	case ETHERTYPE_IP:
		ip4 = (struct ip *)(eh + 1);
		th = (struct tcphdr *)(ip4 + 1);
		break;
#endif
	}
	return (th);
}

static void
lro_free_mbuf_chain(struct mbuf *m)
{
	struct mbuf *save;

	while (m) {
		save = m->m_nextpkt;
		m->m_nextpkt = NULL;
		m_freem(m);
		m = save;
	}
}

void
tcp_lro_free(struct lro_ctrl *lc)
{
	struct lro_entry *le;
	unsigned x;

	/* reset LRO free list */
	LIST_INIT(&lc->lro_free);

	/* free active mbufs, if any */
	while ((le = LIST_FIRST(&lc->lro_active)) != NULL) {
		tcp_lro_active_remove(le);
		lro_free_mbuf_chain(le->m_head);
	}

	/* free hash table */
	free(lc->lro_hash, M_LRO);
	lc->lro_hash = NULL;
	lc->lro_hashsz = 0;

	/* free mbuf array, if any */
	for (x = 0; x != lc->lro_mbuf_count; x++)
		m_freem(lc->lro_mbuf_data[x].mb);
	lc->lro_mbuf_count = 0;

	/* free allocated memory, if any */
	free(lc->lro_mbuf_data, M_LRO);
	lc->lro_mbuf_data = NULL;
}

static uint16_t
tcp_lro_csum_th(struct tcphdr *th)
{
	uint32_t ch;
	uint16_t *p, l;

	ch = th->th_sum = 0x0000;
	l = th->th_off;
	p = (uint16_t *)th;
	while (l > 0) {
		ch += *p;
		p++;
		ch += *p;
		p++;
		l--;
	}
	while (ch > 0xffff)
		ch = (ch >> 16) + (ch & 0xffff);

	return (ch & 0xffff);
}

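/*
 * Worked example for the folding loop above (illustrative only): the
 * header is accumulated in 16-bit words, so a raw sum of 0x2fffd folds
 * as 0x0002 + 0xfffd = 0xffff; carries are repeatedly added back until
 * the value fits in 16 bits, which is the usual one's complement
 * reduction.
 */
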
static uint16_t
tcp_lro_rx_csum_fixup(struct lro_entry *le, void *l3hdr, struct tcphdr *th,
    uint16_t tcp_data_len, uint16_t csum)
{
	uint32_t c;
	uint16_t cs;

	c = csum;

	/* Remove length from checksum. */
	switch (le->eh_type) {
#ifdef INET6
	case ETHERTYPE_IPV6:
	{
		struct ip6_hdr *ip6;

		ip6 = (struct ip6_hdr *)l3hdr;
		if (le->append_cnt == 0)
			cs = ip6->ip6_plen;
		else {
			uint32_t cx;

			cx = ntohs(ip6->ip6_plen);
			cs = in6_cksum_pseudo(ip6, cx, ip6->ip6_nxt, 0);
		}
		break;
	}
#endif
#ifdef INET
	case ETHERTYPE_IP:
	{
		struct ip *ip4;

		ip4 = (struct ip *)l3hdr;
		if (le->append_cnt == 0)
			cs = ip4->ip_len;
		else {
			cs = in_addword(ntohs(ip4->ip_len) - sizeof(*ip4),
			    IPPROTO_TCP);
			cs = in_pseudo(ip4->ip_src.s_addr, ip4->ip_dst.s_addr,
			    htons(cs));
		}
		break;
	}
#endif
	default:
		cs = 0;		/* Keep compiler happy. */
	}

	cs = ~cs;
	c += cs;

	/* Remove TCP header csum. */
	cs = ~tcp_lro_csum_th(th);
	c += cs;
	while (c > 0xffff)
		c = (c >> 16) + (c & 0xffff);

	return (c & 0xffff);
}

static void
tcp_lro_rx_done(struct lro_ctrl *lc)
{
	struct lro_entry *le;

	while ((le = LIST_FIRST(&lc->lro_active)) != NULL) {
		tcp_lro_active_remove(le);
		tcp_lro_flush(lc, le);
	}
}

void
tcp_lro_flush_inactive(struct lro_ctrl *lc, const struct timeval *timeout)
{
	struct lro_entry *le, *le_tmp;
	struct timeval tv;

	if (LIST_EMPTY(&lc->lro_active))
		return;

	getmicrouptime(&tv);
	timevalsub(&tv, timeout);
	LIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) {
		if (timevalcmp(&tv, &le->mtime, >=)) {
			tcp_lro_active_remove(le);
			tcp_lro_flush(lc, le);
		}
	}
}

#ifdef INET6
static int
tcp_lro_rx_ipv6(struct lro_ctrl *lc, struct mbuf *m, struct ip6_hdr *ip6,
    struct tcphdr **th)
{

	/* XXX-BZ we should check the flow-label. */

	/* XXX-BZ We do not yet support ext. hdrs. */
	if (ip6->ip6_nxt != IPPROTO_TCP)
		return (TCP_LRO_NOT_SUPPORTED);

	/* Find the TCP header. */
	*th = (struct tcphdr *)(ip6 + 1);

	return (0);
}
#endif

#ifdef INET
static int
tcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4,
    struct tcphdr **th)
{
	int csum_flags;
	uint16_t csum;

	if (ip4->ip_p != IPPROTO_TCP)
		return (TCP_LRO_NOT_SUPPORTED);

	/* Ensure there are no options. */
	if ((ip4->ip_hl << 2) != sizeof (*ip4))
		return (TCP_LRO_CANNOT);

	/* .. and the packet is not fragmented. */
	if (ip4->ip_off & htons(IP_MF|IP_OFFMASK))
		return (TCP_LRO_CANNOT);

	/* Legacy IP has a header checksum that needs to be correct. */
	csum_flags = m->m_pkthdr.csum_flags;
	if (csum_flags & CSUM_IP_CHECKED) {
		if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) {
			lc->lro_bad_csum++;
			return (TCP_LRO_CANNOT);
		}
	} else {
		csum = in_cksum_hdr(ip4);
		if (__predict_false((csum) != 0)) {
			lc->lro_bad_csum++;
			return (TCP_LRO_CANNOT);
		}
	}
	/* Find the TCP header (we assured there are no IP options). */
	*th = (struct tcphdr *)(ip4 + 1);
	return (0);
}
#endif

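/*
 * Typical driver integration (a sketch only; get_next_rx_mbuf() is a
 * hypothetical placeholder for the driver's own RX dequeue):
 *
 *	struct lro_ctrl lro;
 *
 *	tcp_lro_init(&lro);
 *	lro.ifp = ifp;
 *	...
 *	while ((m = get_next_rx_mbuf(sc)) != NULL) {
 *		if (tcp_lro_rx(&lro, m, 0) != 0)
 *			(*ifp->if_input)(ifp, m);
 *	}
 *	tcp_lro_flush_all(&lro);
 *
 * A non-zero return from tcp_lro_rx() means LRO could not hold the
 * packet and it must be input directly.
 */
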
#ifdef TCPHPTS
static void
tcp_lro_log(struct tcpcb *tp, struct lro_ctrl *lc,
    struct lro_entry *le, struct mbuf *m, int frm, int32_t tcp_data_len,
    uint32_t th_seq, uint32_t th_ack, uint16_t th_win)
{
	if (tp->t_logstate != TCP_LOG_STATE_OFF) {
		union tcp_log_stackspecific log;
		struct timeval tv;
		uint32_t cts;

		cts = tcp_get_usecs(&tv);
		memset(&log, 0, sizeof(union tcp_log_stackspecific));
		log.u_bbr.flex8 = frm;
		log.u_bbr.flex1 = tcp_data_len;
		if (m)
			log.u_bbr.flex2 = m->m_pkthdr.len;
		else
			log.u_bbr.flex2 = 0;
		log.u_bbr.flex3 = le->append_cnt;
		log.u_bbr.flex4 = le->p_len;
		if (le->m_head) {
			log.u_bbr.flex5 = le->m_head->m_pkthdr.len;
			log.u_bbr.delRate = le->m_head->m_flags;
			log.u_bbr.rttProp = le->m_head->m_pkthdr.rcv_tstmp;
		}
		log.u_bbr.inflight = th_seq;
		log.u_bbr.timeStamp = cts;
		log.u_bbr.epoch = le->next_seq;
		log.u_bbr.delivered = th_ack;
		log.u_bbr.lt_epoch = le->ack_seq;
		log.u_bbr.pacing_gain = th_win;
		log.u_bbr.cwnd_gain = le->window;
		log.u_bbr.cur_del_rate = (uintptr_t)m;
		log.u_bbr.bw_inuse = (uintptr_t)le->m_head;
		log.u_bbr.pkts_out = le->mbuf_cnt;	/* Total mbufs added */
		log.u_bbr.applimited = le->ulp_csum;
		log.u_bbr.lost = le->mbuf_appended;
		log.u_bbr.pkt_epoch = le->cmp_ack_cnt;
		log.u_bbr.flex6 = tcp_tv_to_usectick(&lc->lro_last_flush);
		if (in_epoch(net_epoch_preempt))
			log.u_bbr.inhpts = 1;
		else
			log.u_bbr.inhpts = 0;
		TCP_LOG_EVENTP(tp, NULL,
		    &tp->t_inpcb->inp_socket->so_rcv,
		    &tp->t_inpcb->inp_socket->so_snd,
		    TCP_LOG_LRO, 0,
		    0, &log, false, &tv);
	}
}
#endif

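/*
 * Note on the IP checksum fix-up in tcp_flush_out_le() below: when
 * only ip_len changes, the header checksum can be updated
 * incrementally (the standard RFC 1624 form):
 *
 *	HC' = ~(~HC + ~m + m')
 *
 * where m is the old ip_len and m' the new one.  The code adds
 * ~ip_sum, ~ip_len and the new length, folds the carries, and
 * complements the result.
 */
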
static void
tcp_flush_out_le(struct tcpcb *tp, struct lro_ctrl *lc, struct lro_entry *le)
{
	if (le->append_cnt > 1) {
		struct tcphdr *th;
		uint16_t p_len;

		p_len = htons(le->p_len);
		switch (le->eh_type) {
#ifdef INET6
		case ETHERTYPE_IPV6:
		{
			struct ip6_hdr *ip6;

			ip6 = le->le_ip6;
			ip6->ip6_plen = p_len;
			th = (struct tcphdr *)(ip6 + 1);
			le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
			    CSUM_PSEUDO_HDR;
			le->p_len += ETHER_HDR_LEN + sizeof(*ip6);
			break;
		}
#endif
#ifdef INET
		case ETHERTYPE_IP:
		{
			struct ip *ip4;
			uint32_t cl;
			uint16_t c;

			ip4 = le->le_ip4;
			/* Fix IP header checksum for new length. */
			c = ~ip4->ip_sum;
			cl = c;
			c = ~ip4->ip_len;
			cl += c + p_len;
			while (cl > 0xffff)
				cl = (cl >> 16) + (cl & 0xffff);
			c = cl;
			ip4->ip_sum = ~c;
			ip4->ip_len = p_len;
			th = (struct tcphdr *)(ip4 + 1);
			le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
			    CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID;
			le->p_len += ETHER_HDR_LEN;
			break;
		}
#endif
		default:
			th = NULL;	/* Keep compiler happy. */
		}
		le->m_head->m_pkthdr.csum_data = 0xffff;
		le->m_head->m_pkthdr.len = le->p_len;

		/* Incorporate the latest ACK into the TCP header. */
		th->th_ack = le->ack_seq;
		th->th_win = le->window;
		/* Incorporate the latest timestamp into the TCP header. */
		if (le->timestamp != 0) {
			uint32_t *ts_ptr;

			ts_ptr = (uint32_t *)(th + 1);
			ts_ptr[1] = htonl(le->tsval);
			ts_ptr[2] = le->tsecr;
		}
		/* Update the TCP header checksum. */
		le->ulp_csum += p_len;
		le->ulp_csum += tcp_lro_csum_th(th);
		while (le->ulp_csum > 0xffff)
			le->ulp_csum = (le->ulp_csum >> 16) +
			    (le->ulp_csum & 0xffff);
		th->th_sum = (le->ulp_csum & 0xffff);
		th->th_sum = ~th->th_sum;
	}
	/*
	 * Break any chain.  In the singleton case m_nextpkt points to
	 * m_head and is not otherwise set to NULL; in the other cases
	 * push_and_replace already cleared m_nextpkt.
	 */
	le->m_head->m_nextpkt = NULL;
	le->m_head->m_pkthdr.lro_nsegs = le->append_cnt;
	(*lc->ifp->if_input)(lc->ifp, le->m_head);
	lc->lro_queued += le->append_cnt;
}

static void
tcp_set_le_to_m(struct lro_ctrl *lc, struct lro_entry *le, struct mbuf *m)
{
	struct ether_header *eh;
	void *l3hdr = NULL;		/* Keep compiler happy. */
	struct tcphdr *th;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;	/* Keep compiler happy. */
#endif
#ifdef INET
	struct ip *ip4 = NULL;		/* Keep compiler happy. */
#endif
	uint32_t *ts_ptr;
	int error, l, ts_failed = 0;
	uint16_t tcp_data_len;
	uint16_t csum;

	error = -1;
	eh = mtod(m, struct ether_header *);
	/*
	 * We must reset the other pointers since the mbuf
	 * we were pointing to is about to go away.
	 */
	switch (le->eh_type) {
#ifdef INET6
	case ETHERTYPE_IPV6:
		l3hdr = ip6 = (struct ip6_hdr *)(eh + 1);
		error = tcp_lro_rx_ipv6(lc, m, ip6, &th);
		le->le_ip6 = ip6;
		le->source_ip6 = ip6->ip6_src;
		le->dest_ip6 = ip6->ip6_dst;
		le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN - sizeof(*ip6);
		break;
#endif
#ifdef INET
	case ETHERTYPE_IP:
		l3hdr = ip4 = (struct ip *)(eh + 1);
		error = tcp_lro_rx_ipv4(lc, m, ip4, &th);
		le->le_ip4 = ip4;
		le->source_ip4 = ip4->ip_src.s_addr;
		le->dest_ip4 = ip4->ip_dst.s_addr;
		le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN;
		break;
#endif
	}
	KASSERT(error == 0, ("%s: le=%p tcp_lro_rx_xxx failed\n",
	    __func__, le));
	ts_ptr = (uint32_t *)(th + 1);
	l = (th->th_off << 2);
	l -= sizeof(*th);
	if (l != 0 &&
	    (__predict_false(l != TCPOLEN_TSTAMP_APPA) ||
	    (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
	    TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) {
		/* We failed to find a timestamp; some other option? */
		ts_failed = 1;
	}
	if ((l != 0) && (ts_failed == 0)) {
		le->timestamp = 1;
		le->tsval = ntohl(*(ts_ptr + 1));
		le->tsecr = *(ts_ptr + 2);
	} else
		le->timestamp = 0;
	le->source_port = th->th_sport;
	le->dest_port = th->th_dport;
	/* Pull out the csum */
	tcp_data_len = m->m_pkthdr.lro_len;
	le->next_seq = ntohl(th->th_seq) + tcp_data_len;
	le->ack_seq = th->th_ack;
	le->window = th->th_win;
	csum = th->th_sum;
	/* Setup the data pointers */
	le->m_head = m;
	le->m_tail = m_last(m);
	le->append_cnt = 0;
	le->ulp_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th, tcp_data_len,
	    ~csum);
	le->append_cnt++;
	th->th_sum = csum;	/* Restore checksum on first packet. */
}

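/*
 * Option-layout note: the TCPOLEN_TSTAMP_APPA comparisons in
 * tcp_set_le_to_m() above and tcp_lro_condense() further below match
 * the "appendix A" timestamp layout from RFC 1323/7323: two NOPs
 * followed by the timestamp option, i.e. the 12 option bytes
 *
 *	0x01 0x01 0x08 0x0a <4-byte TSval> <4-byte TSecr>
 *
 * which is why a single 32-bit compare of *ts_ptr against
 * NOP|NOP|TIMESTAMP|TCPOLEN_TIMESTAMP suffices.
 */
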
static void
tcp_push_and_replace(struct tcpcb *tp, struct lro_ctrl *lc,
    struct lro_entry *le, struct mbuf *m)
{
	/*
	 * Push the current le up the stack and replace
	 * it with m.
	 */
	struct mbuf *msave;

	/* Grab off the next and save it */
	msave = le->m_head->m_nextpkt;
	le->m_head->m_nextpkt = NULL;
	/* Now push out the old le entry */
	tcp_flush_out_le(tp, lc, le);
	/*
	 * Now to replace the data properly in the le
	 * we have to reset the tcp header and
	 * other fields.
	 */
	tcp_set_le_to_m(lc, le, m);
	/* Restore the next list */
	m->m_nextpkt = msave;
}

static void
tcp_lro_condense(struct tcpcb *tp, struct lro_ctrl *lc, struct lro_entry *le)
{
	/*
	 * Walk through the mbuf chain we
	 * have on tap and compress/condense
	 * as required.
	 */
	uint32_t *ts_ptr;
	struct mbuf *m;
	struct tcphdr *th;
	uint16_t tcp_data_len, csum_upd;
	int l;

	/*
	 * First we must check the lead (m_head).
	 * We must make sure that it is *not*
	 * something that should be sent up
	 * right away (sack etc).
	 */
again:
	m = le->m_head->m_nextpkt;
	if (m == NULL) {
		/* Just the one left */
		return;
	}
	if (m->m_flags & M_ACKCMP)
		panic("LRO condense lc:%p le:%p reaches with mbuf:%p ackcmp",
		    lc, le, m);
	th = tcp_lro_get_th(le, le->m_head);
	KASSERT(th != NULL,
	    ("le:%p m:%p th comes back NULL?", le, le->m_head));
	l = (th->th_off << 2);
	l -= sizeof(*th);
	ts_ptr = (uint32_t *)(th + 1);
	if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) ||
	    (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
	    TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) {
		/*
		 * It's not the timestamp.  We can't
		 * use this mbuf as the head.
		 */
		le->m_head->m_nextpkt = m->m_nextpkt;
		tcp_push_and_replace(tp, lc, le, m);
		goto again;
	}
	if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) {
		/*
		 * Make sure that previously seen segments/ACKs are delivered
		 * before this segment, e.g. FIN.
		 */
		le->m_head->m_nextpkt = m->m_nextpkt;
		KASSERT(((m->m_flags & M_LRO_EHDRSTRP) == 0),
		    ("tp:%p mbuf:%p has stripped ethernet flags:0x%x",
		    tp, m, m->m_flags));
		tcp_push_and_replace(tp, lc, le, m);
		goto again;
	}
	while ((m = le->m_head->m_nextpkt) != NULL) {
		/*
		 * condense m into le, first
		 * pull m out of the list.
		 */
		KASSERT(((m->m_flags & M_LRO_EHDRSTRP) == 0),
		    ("tp:%p mbuf:%p has stripped ethernet flags:0x%x",
		    tp, m, m->m_flags));
		KASSERT(((m->m_flags & M_ACKCMP) == 0),
		    ("LRO condense lc:%p le:%p reaches with mbuf:%p ackcmp",
		    lc, le, m));
		le->m_head->m_nextpkt = m->m_nextpkt;
		m->m_nextpkt = NULL;
		/* Setup my data */
		tcp_data_len = m->m_pkthdr.lro_len;
		th = tcp_lro_get_th(le, m);
		KASSERT(th != NULL,
		    ("le:%p m:%p th comes back NULL?", le, m));
		ts_ptr = (uint32_t *)(th + 1);
		l = (th->th_off << 2);
		l -= sizeof(*th);
		if (le->append_cnt >= lc->lro_ackcnt_lim) {
			tcp_push_and_replace(tp, lc, le, m);
			goto again;
		}
		if (le->p_len > (lc->lro_length_lim - tcp_data_len)) {
			/* Flush now if appending will result in overflow. */
			tcp_push_and_replace(tp, lc, le, m);
			goto again;
		}
		if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) ||
		    (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
		    TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) {
			/*
			 * Maybe a sack in the new one?  We need to
			 * start all over after flushing the
			 * current le.  We will go up to the beginning
			 * and flush it (calling the replace again possibly
			 * or just returning).
			 */
			tcp_push_and_replace(tp, lc, le, m);
			goto again;
		}
		if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) {
			tcp_push_and_replace(tp, lc, le, m);
			goto again;
		}
		if (l != 0) {
			uint32_t tsval = ntohl(*(ts_ptr + 1));
			/* Make sure timestamp values are increasing. */
			if (TSTMP_GT(le->tsval, tsval)) {
				tcp_push_and_replace(tp, lc, le, m);
				goto again;
			}
			le->tsval = tsval;
			le->tsecr = *(ts_ptr + 2);
		}
		/* Try to append the new segment. */
		if (__predict_false(ntohl(th->th_seq) != le->next_seq ||
		    (tcp_data_len == 0 &&
		    le->ack_seq == th->th_ack &&
		    le->window == th->th_win))) {
			/* Out of order packet or duplicate ACK. */
			tcp_push_and_replace(tp, lc, le, m);
			goto again;
		}
		if (tcp_data_len || SEQ_GT(ntohl(th->th_ack),
		    ntohl(le->ack_seq))) {
			le->next_seq += tcp_data_len;
			le->ack_seq = th->th_ack;
			le->window = th->th_win;
		} else if (th->th_ack == le->ack_seq) {
			le->window = WIN_MAX(le->window, th->th_win);
		}
		csum_upd = m->m_pkthdr.lro_csum;
		le->ulp_csum += csum_upd;
		if (tcp_data_len == 0) {
			le->append_cnt++;
			le->mbuf_cnt--;
			m_freem(m);
			continue;
		}
		le->append_cnt++;
		le->mbuf_appended++;
		le->p_len += tcp_data_len;
		/*
		 * Adjust the mbuf so that m_data points to the first byte of
		 * the ULP payload.  Adjust the mbuf to avoid complications and
		 * append new segment to existing mbuf chain.
		 */
		m_adj(m, m->m_pkthdr.len - tcp_data_len);
		m_demote_pkthdr(m);
		le->m_tail->m_next = m;
		le->m_tail = m_last(m);
	}
}

#ifdef TCPHPTS
static void
tcp_queue_pkts(struct tcpcb *tp, struct lro_entry *le)
{
	if (tp->t_in_pkt == NULL) {
		/* Nothing there yet */
		tp->t_in_pkt = le->m_head;
		tp->t_tail_pkt = le->m_last_mbuf;
	} else {
		/* Already some there */
		tp->t_tail_pkt->m_nextpkt = le->m_head;
		tp->t_tail_pkt = le->m_last_mbuf;
	}
	le->m_head = NULL;
	le->m_last_mbuf = NULL;
}

static struct mbuf *
tcp_lro_get_last_if_ackcmp(struct lro_ctrl *lc, struct lro_entry *le,
    struct inpcb *inp)
{
	struct mbuf *m = NULL;
	struct tcpcb *tp;

	tp = intotcpcb(inp);
	if (tp) {
		/* Look at the last mbuf, if any, in the queue */
		if ((tp->t_tail_pkt) &&
		    (tp->t_tail_pkt->m_flags & M_ACKCMP)) {
			if (M_TRAILINGSPACE(tp->t_tail_pkt) >=
			    sizeof(struct tcp_ackent)) {
				tcp_lro_log(tp, lc, le, NULL, 23, 0, 0, 0, 0);
				m = tp->t_tail_pkt;
			} else {
				if ((inp->inp_flags2 & INP_MBUF_L_ACKS) == 0) {
					counter_u64_add(tcp_would_have_but, 1);
					inp->inp_flags2 |= INP_MBUF_L_ACKS;
				}
			}
		}
	}
	return (m);
}

static struct inpcb *
tcp_lro_lookup(struct lro_ctrl *lc, struct lro_entry *le)
{
	struct inpcb *inp = NULL;

	NET_EPOCH_ASSERT();
	switch (le->eh_type) {
#ifdef INET6
	case ETHERTYPE_IPV6:
		inp = in6_pcblookup(&V_tcbinfo, &le->source_ip6,
		    le->source_port, &le->dest_ip6, le->dest_port,
		    INPLOOKUP_WLOCKPCB,
		    lc->ifp);
		break;
#endif
#ifdef INET
	case ETHERTYPE_IP:
		inp = in_pcblookup(&V_tcbinfo, le->le_ip4->ip_src,
		    le->source_port, le->le_ip4->ip_dst, le->dest_port,
		    INPLOOKUP_WLOCKPCB,
		    lc->ifp);
		break;
#endif
	}
	return (inp);
}
#endif

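/*
 * Locking note (restating what the code above enforces):
 * tcp_lro_lookup() must run inside the network epoch
 * (NET_EPOCH_ASSERT) and, on success, returns the inpcb write-locked
 * (INPLOOKUP_WLOCKPCB).  Callers that cache the inp in the LRO entry
 * across the unlock, as tcp_lro_rx2() does, take an extra reference
 * with in_pcbref() and drop it later via in_pcbrele_wlocked().
 */
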
#ifdef NO
static void
stack_guard_prep(uint32_t *sg, int len)
{
	int i;

	for (i = 0; i < len; i++) {
		sg[i] = 0xdeadc0de;
	}
}

static void
stack_guard_check(struct lro_ctrl *lc, struct lro_entry *le, uint32_t *sg,
    int len)
{
	int i;

	for (i = 0; i < len; i++) {
		if (sg[i] != 0xdeadc0de)
			panic("Stack guard fails sg[%d] = 0x%x le:%p lc:%p sg:%p\n",
			    i, sg[i], le, lc, sg);
	}
}
#endif

void
tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le)
{
	struct tcpcb *tp = NULL;
#ifdef TCPHPTS
	struct inpcb *inp = NULL;
	int need_wakeup = 0, can_queue = 0;

	/* Now lets lookup the inp first */
	CURVNET_SET(lc->ifp->if_vnet);
	/*
	 * XXXRRS Currently the common input handler for
	 * mbuf queuing cannot handle VLAN-tagged frames.  This needs
	 * to be fixed and the or condition removed (i.e. the
	 * common code should do the right lookup for the vlan
	 * tag and anything else that vlan_input() does).
	 */
	if (le->m_head == NULL) {
		/*
		 * Everything was pushed up to the stack; nothing to do
		 * but release the reference and be done.
		 */
		if (le->inp) {
			INP_WLOCK(le->inp);
			if (in_pcbrele_wlocked(le->inp) == 0) {
				/*
				 * We released it and still
				 * have the lock.
				 */
				INP_WUNLOCK(le->inp);
			}
		}
		goto done;
	}
	if ((tcplro_stacks_wanting_mbufq == 0) ||
	    (le->m_head->m_flags & M_VLANTAG))
		goto skip_lookup;

	if (le->inp == NULL) {
		le->inp = inp = tcp_lro_lookup(lc, le);
		if (inp && ((inp->inp_flags & (INP_DROPPED|INP_TIMEWAIT)) ||
		    (inp->inp_flags2 & INP_FREED))) {
			/*
			 * We can't present these to the inp since
			 * it will not support the stripped ethernet
			 * header that these have, nor if a compressed
			 * ack is present.
			 */
			INP_WUNLOCK(inp);
			lro_free_mbuf_chain(le->m_head);
			goto done;
		}
		if ((le->flags & HAS_COMP_ENTRIES) &&
		    ((inp->inp_flags2 & INP_MBUF_ACKCMP) == 0)) {
			/*
			 * It swapped to off, must be a stack
			 * switch.  We need to ditch all the packets
			 * and the peer will just have to retransmit.
			 */
			INP_WUNLOCK(inp);
			lro_free_mbuf_chain(le->m_head);
			goto done;
		}
	} else {
		/* We have a reference on the inp; lock and release it */
		inp = le->inp;
		INP_WLOCK(inp);
		if (in_pcbrele_wlocked(inp)) {
			/*
			 * We lost the inp.  We can't present these to
			 * the inp since it will not support the
			 * stripped-off ethernet header.
			 */
			lro_free_mbuf_chain(le->m_head);
			goto done;
		}
		if (inp && ((inp->inp_flags & (INP_DROPPED|INP_TIMEWAIT)) ||
		    (inp->inp_flags2 & INP_FREED))) {
			/*
			 * We can't present these to the inp since
			 * it may not support them.
			 */
			INP_WUNLOCK(inp);
			lro_free_mbuf_chain(le->m_head);
			goto done;
		}
		if ((le->flags & HAS_COMP_ENTRIES) &&
		    ((inp->inp_flags2 & INP_MBUF_ACKCMP) == 0)) {
			/*
			 * It swapped to off, must be a stack
			 * switch.  We need to ditch all the packets
			 * and the peer will just have to retransmit.
			 */
			INP_WUNLOCK(inp);
			lro_free_mbuf_chain(le->m_head);
			goto done;
		}
	}
	if (inp && ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) ||
	    (inp->inp_flags2 & INP_MBUF_ACKCMP))) {
		/* The transport supports mbuf queuing */
		can_queue = 1;
		if (le->need_wakeup ||
		    ((inp->inp_in_input == 0) &&
		    ((inp->inp_flags2 & INP_MBUF_QUEUE_READY) == 0))) {
			/*
			 * Either the transport is off on a keep-alive
			 * (it has the queue_ready flag clear and has
			 * not already been woken) or the entry has
			 * some urgent thing (FIN or possibly SACK blocks).
			 * This means we need to wake the transport up by
			 * putting it on the input pacer.
			 */
			need_wakeup = 1;
			if ((inp->inp_flags2 & INP_DONT_SACK_QUEUE) &&
			    (le->need_wakeup != 1)) {
				/*
				 * Prohibited from a sack wakeup.
				 */
				need_wakeup = 0;
			}
		}
		/* Do we need to be awoken due to lots of data or acks? */
		if ((le->tcp_tot_p_len >= lc->lro_length_lim) ||
		    (le->mbuf_cnt >= lc->lro_ackcnt_lim))
			need_wakeup = 1;
	}
	if (inp)
		tp = intotcpcb(inp);
	else
		tp = NULL;
	if (can_queue) {
		counter_u64_add(tcp_inp_lro_direct_queue, 1);
		tcp_lro_log(tp, lc, le, NULL, 22, need_wakeup,
		    inp->inp_flags2, inp->inp_in_input, le->need_wakeup);
		tcp_queue_pkts(tp, le);
		if (need_wakeup) {
			/*
			 * We must get the transport to wake up via
			 * hpts.
			 */
			NET_EPOCH_ASSERT();
			if (le->need_wakeup == 2) {
				/*
				 * The value 2 is set if the
				 * options are unrecognized, i.e.
				 * not just a timestamp.  Usually that
				 * means SACK, but it might be some
				 * other option (CWR etc).
				 */
				counter_u64_add(tcp_inp_lro_sack_wake, 1);
			}
			counter_u64_add(tcp_inp_lro_wokeup_queue, 1);
			if ((*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket,
			    tp, 0)) {
				inp = NULL;
			}
		}
	}
	if (inp) {
		/* Unlock it */
		tp = NULL;
		counter_u64_add(tcp_inp_lro_locks_taken, 1);
		INP_WUNLOCK(inp);
	}
	if (can_queue == 0) {
skip_lookup:
		if (le->strip_cnt) {
			/*
			 * We have stripped mbufs; the connection
			 * must have changed underneath us.  You
			 * lose the packets as a penalty.
			 */
			lro_free_mbuf_chain(le->m_head);
			goto done;
		}
#endif /* TCPHPTS */
		/* Old fashioned lro method */
		if (le->m_head != le->m_last_mbuf) {
			counter_u64_add(tcp_inp_lro_compressed, 1);
			tcp_lro_condense(tp, lc, le);
		} else
			counter_u64_add(tcp_inp_lro_single_push, 1);
		tcp_flush_out_le(tp, lc, le);
#ifdef TCPHPTS
	}
done:
	CURVNET_RESTORE();
#endif
	lc->lro_flushed++;
	bzero(le, sizeof(*le));
	LIST_INSERT_HEAD(&lc->lro_free, le, next);
}

#ifdef HAVE_INLINE_FLSLL
#define	tcp_lro_msb_64(x) (1ULL << (flsll(x) - 1))
#else
static inline uint64_t
tcp_lro_msb_64(uint64_t x)
{
	x |= (x >> 1);
	x |= (x >> 2);
	x |= (x >> 4);
	x |= (x >> 8);
	x |= (x >> 16);
	x |= (x >> 32);
	return (x & ~(x >> 1));
}
#endif

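/*
 * Example (illustrative): for x = 0x58 the fallback smears the bits
 * down to 0x7f and then keeps only the top set bit, so
 * tcp_lro_msb_64(0x58) == 0x40; the flsll()-based macro computes the
 * same value as 1ULL << (7 - 1).
 */
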
/*
 * The tcp_lro_sort() routine is comparable to qsort(), except it has
 * a worst case complexity limit of O(MIN(N,64)*N), where N is the
 * number of elements to sort and 64 is the number of sequence bits
 * available.  The algorithm is bit-slicing the 64-bit sequence number,
 * sorting one bit at a time from the most significant bit until the
 * least significant one, skipping the constant bits.  This is
 * typically called a radix sort.
 */
static void
tcp_lro_sort(struct lro_mbuf_sort *parray, uint32_t size)
{
	struct lro_mbuf_sort temp;
	uint64_t ones;
	uint64_t zeros;
	uint32_t x;
	uint32_t y;

repeat:
	/* for small arrays insertion sort is faster */
	if (size <= 12) {
		for (x = 1; x < size; x++) {
			temp = parray[x];
			for (y = x; y > 0 && temp.seq < parray[y - 1].seq; y--)
				parray[y] = parray[y - 1];
			parray[y] = temp;
		}
		return;
	}

	/* compute sequence bits which are constant */
	ones = 0;
	zeros = 0;
	for (x = 0; x != size; x++) {
		ones |= parray[x].seq;
		zeros |= ~parray[x].seq;
	}

	/* compute bits which are not constant into "ones" */
	ones &= zeros;
	if (ones == 0)
		return;

	/* pick the most significant bit which is not constant */
	ones = tcp_lro_msb_64(ones);

	/*
	 * Move entries having cleared sequence bits to the beginning
	 * of the array:
	 */
	for (x = y = 0; y != size; y++) {
		/* skip set bits */
		if (parray[y].seq & ones)
			continue;
		/* swap entries */
		temp = parray[x];
		parray[x] = parray[y];
		parray[y] = temp;
		x++;
	}

	KASSERT(x != 0 && x != size, ("Memory is corrupted\n"));

	/* sort zeros */
	tcp_lro_sort(parray, x);

	/* sort ones */
	parray += x;
	size -= x;
	goto repeat;
}

void
tcp_lro_flush_all(struct lro_ctrl *lc)
{
	uint64_t seq;
	uint64_t nseq;
	unsigned x;

	/* check if no mbufs to flush */
	if (lc->lro_mbuf_count == 0)
		goto done;

	microuptime(&lc->lro_last_flush);
	/* sort all mbufs according to stream */
	tcp_lro_sort(lc->lro_mbuf_data, lc->lro_mbuf_count);

	/* input data into LRO engine, stream by stream */
	seq = 0;
	for (x = 0; x != lc->lro_mbuf_count; x++) {
		struct mbuf *mb;

		/* get mbuf */
		mb = lc->lro_mbuf_data[x].mb;

		/* get sequence number, masking away the packet index */
		nseq = lc->lro_mbuf_data[x].seq & (-1ULL << 24);

		/* check for new stream */
		if (seq != nseq) {
			seq = nseq;

			/* flush active streams */
			tcp_lro_rx_done(lc);
		}

		/* add packet to LRO engine */
		if (tcp_lro_rx2(lc, mb, 0, 0) != 0) {
			/* input packet to network layer */
			(*lc->ifp->if_input)(lc->ifp, mb);
			lc->lro_queued++;
			lc->lro_flushed++;
		}
	}
done:
	/* flush active streams */
	tcp_lro_rx_done(lc);

	lc->lro_mbuf_count = 0;
}

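/*
 * Key-layout note (see tcp_lro_queue_mbuf() at the end of this file):
 * each sort record's 64-bit seq is
 *
 *	(hash type << 56) | (flowid << 24) | (array index)
 *
 * so sorting groups packets by RSS stream while the low 24 index bits
 * keep arrival order stable within a stream.  That is why
 * tcp_lro_flush_all() masks with (-1ULL << 24) when detecting stream
 * boundaries above.
 */
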
static void
lro_set_mtime(struct timeval *tv, struct timespec *ts)
{
	tv->tv_sec = ts->tv_sec;
	tv->tv_usec = ts->tv_nsec / 1000;
}

#ifdef TCPHPTS
static void
build_ack_entry(struct tcp_ackent *ae, struct tcphdr *th, struct mbuf *m,
    uint16_t hdr_len, uint16_t iptos)
{
	/*
	 * Given a TCP ack, summarize it down into the small
	 * tcp ack entry.
	 */
	u_char *cp;

	KASSERT(((th->th_flags & ~(TH_ACK | TH_PUSH | TH_CWR | TH_ECE)) == 0),
	    ("tcphdr:%p mbuf:%p has unallowed bits %x", th, m, th->th_flags));
	ae->timestamp = m->m_pkthdr.rcv_tstmp;
	if (m->m_flags & M_TSTMP_LRO)
		ae->flags = TSTMP_LRO;
	else if (m->m_flags & M_TSTMP)
		ae->flags = TSTMP_HDWR;
	ae->seq = ntohl(th->th_seq);
	ae->ack = ntohl(th->th_ack);
	ae->flags |= th->th_flags;
	if (hdr_len) {
		/* We have a timestamp option; get out the bits */
		cp = (u_char *)(th + 1);
		/* Skip the two NOPs at the front */
		while (*cp == TCPOPT_NOP)
			cp++;
		KASSERT(((*cp == TCPOPT_TIMESTAMP) &&
		    (cp[1] == TCPOLEN_TIMESTAMP)),
		    ("At %p in tcphdr:%p options of %d not timestamp",
		    cp, th, hdr_len));
		bcopy((char *)cp + 2,
		    (char *)&ae->ts_value, sizeof(uint32_t));
		ae->ts_value = ntohl(ae->ts_value);
		bcopy((char *)cp + 6,
		    (char *)&ae->ts_echo, sizeof(uint32_t));
		ae->ts_echo = ntohl(ae->ts_echo);
		ae->flags |= HAS_TSTMP;
	}
	ae->win = ntohs(th->th_win);
	ae->codepoint = iptos;
}

static struct mbuf *
do_bpf_and_csum(struct inpcb *inp, struct lro_ctrl *lc, struct lro_entry *le,
    struct ether_header *eh, struct mbuf *m, int bpf_req, int locked)
{
	/*
	 * Do TCP/IP checksum and BPF tap for either ACK_CMP packets or
	 * MBUF QUEUE type packets.
	 */
	struct tcphdr *th;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;	/* Keep compiler happy. */
#endif
#ifdef INET
	struct ip *ip = NULL;		/* Keep compiler happy. */
#endif
	uint16_t drop_hdrlen;
	int etype, tlen;
#ifdef INET
	uint8_t iptos;
#endif

	/* Let the BPF see the packet */
	if (bpf_req && lc->ifp)
		ETHER_BPF_MTAP(lc->ifp, m);
	/* Get type and trim off the ethernet header */
	m->m_pkthdr.lro_etype = etype = ntohs(eh->ether_type);
	m_adj(m, sizeof(*eh));
	m->m_flags |= M_LRO_EHDRSTRP;
	switch (etype) {
#ifdef INET6
	case ETHERTYPE_IPV6:
	{
		if (m->m_len < (sizeof(*ip6) + sizeof(*th))) {
			m = m_pullup(m, sizeof(*ip6) + sizeof(*th));
			if (m == NULL) {
				TCPSTAT_INC(tcps_rcvshort);
				m_freem(m);
				return (NULL);
			}
		}
		ip6 = (struct ip6_hdr *)(eh + 1);
		th = (struct tcphdr *)(ip6 + 1);
		tlen = ntohs(ip6->ip6_plen);
		drop_hdrlen = sizeof(*ip6);
		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) {
			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
				counter_u64_add(tcp_csum_hardware_w_ph, 1);
				th->th_sum = m->m_pkthdr.csum_data;
			} else {
				counter_u64_add(tcp_csum_hardware, 1);
				th->th_sum = in6_cksum_pseudo(ip6, tlen,
				    IPPROTO_TCP, m->m_pkthdr.csum_data);
			}
			th->th_sum ^= 0xffff;
		} else {
			counter_u64_add(tcp_csum_software, 1);
			th->th_sum = in6_cksum(m, IPPROTO_TCP, drop_hdrlen,
			    tlen);
		}
		if (th->th_sum) {
			TCPSTAT_INC(tcps_rcvbadsum);
			if (locked) {
				/* Log the bad news */
				struct tcpcb *tp = intotcpcb(inp);

				tcp_lro_log(tp, lc, le, m, 13, tlen,
				    m->m_pkthdr.csum_flags, drop_hdrlen,
				    th->th_sum);
			}
			m_freem(m);
			return (NULL);
		}
		/*
		 * Be proactive about unspecified IPv6 addresses in the
		 * source.  As we use all-zero to indicate an
		 * unbound/unconnected pcb, an unspecified IPv6 address
		 * can be used to confuse us.
		 *
		 * Note that packets with an unspecified IPv6 destination
		 * are already dropped in ip6_input.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
			/* XXX stat */
			m_freem(m);
			return (NULL);
		}
		break;
	}
#endif
#ifdef INET
	case ETHERTYPE_IP:
	{
		if (m->m_len < sizeof (struct tcpiphdr)) {
			if ((m = m_pullup(m, sizeof (struct tcpiphdr)))
			    == NULL) {
				TCPSTAT_INC(tcps_rcvshort);
				m_freem(m);
				return (NULL);
			}
		}
		ip = (struct ip *)(eh + 1);
		th = (struct tcphdr *)(ip + 1);
		iptos = ip->ip_tos;
		drop_hdrlen = sizeof(*ip);
		tlen = ntohs(ip->ip_len) - sizeof(struct ip);
		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
				counter_u64_add(tcp_csum_hardware_w_ph, 1);
				th->th_sum = m->m_pkthdr.csum_data;
			} else {
				counter_u64_add(tcp_csum_hardware, 1);
				th->th_sum = in_pseudo(ip->ip_src.s_addr,
				    ip->ip_dst.s_addr,
				    htonl(m->m_pkthdr.csum_data + tlen +
				    IPPROTO_TCP));
			}
			th->th_sum ^= 0xffff;
		} else {
			int len;
			struct ipovly *ipov = (struct ipovly *)ip;
			/*
			 * Checksum extended TCP header and data.
			 */
			counter_u64_add(tcp_csum_software, 1);
			len = drop_hdrlen + tlen;
			bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
			ipov->ih_len = htons(tlen);
			th->th_sum = in_cksum(m, len);
			/* Reset length for SDT probes. */
			ip->ip_len = htons(len);
			/* Reset TOS bits */
			ip->ip_tos = iptos;
			/* Re-initialization for later version check */
			ip->ip_v = IPVERSION;
			ip->ip_hl = sizeof(*ip) >> 2;
		}
		if (th->th_sum) {
			TCPSTAT_INC(tcps_rcvbadsum);
			if (locked) {
				/* Log the bad news */
				struct tcpcb *tp = intotcpcb(inp);

				tcp_lro_log(tp, lc, le, m, 13, tlen,
				    m->m_pkthdr.csum_flags, drop_hdrlen,
				    th->th_sum);
			}
			m_freem(m);
			return (NULL);
		}
		break;
	}
#endif
	}	/* end switch */
	return (m);
}
#endif

static int
tcp_lro_rx2(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, int use_hash)
{
	struct lro_entry *le;
	struct ether_header *eh;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;	/* Keep compiler happy. */
#endif
#ifdef INET
	struct ip *ip4 = NULL;		/* Keep compiler happy. */
#endif
	struct tcphdr *th;
	void *l3hdr = NULL;		/* Keep compiler happy. */
	uint32_t *ts_ptr;
	tcp_seq seq;
	int error, ip_len, hdr_len, locked = 0;
	uint16_t eh_type, tcp_data_len, need_flush;
#ifdef TCPHPTS
	uint16_t iptos;
#endif
	struct lro_head *bucket;
	struct timespec arrv;

	/* Clear the flags we may use to communicate with TCP */
	m->m_flags &= ~(M_ACKCMP|M_LRO_EHDRSTRP);

	/* We expect a contiguous header [eh, ip, tcp]. */
	if ((m->m_flags & (M_TSTMP_LRO|M_TSTMP)) == 0) {
		/* If there is no hardware or arrival stamp, add one. */
		nanouptime(&arrv);
		m->m_pkthdr.rcv_tstmp = (arrv.tv_sec * 1000000000) +
		    arrv.tv_nsec;
		m->m_flags |= M_TSTMP_LRO;
	}
	eh = mtod(m, struct ether_header *);
	eh_type = ntohs(eh->ether_type);
	switch (eh_type) {
#ifdef INET6
	case ETHERTYPE_IPV6:
	{
		CURVNET_SET(lc->ifp->if_vnet);
		if (V_ip6_forwarding != 0) {
			/* XXX-BZ stats but changing lro_ctrl is a problem. */
			CURVNET_RESTORE();
			return (TCP_LRO_CANNOT);
		}
		CURVNET_RESTORE();
		l3hdr = ip6 = (struct ip6_hdr *)(eh + 1);
		error = tcp_lro_rx_ipv6(lc, m, ip6, &th);
		if (error != 0)
			return (error);
		tcp_data_len = ntohs(ip6->ip6_plen);
#ifdef TCPHPTS
		iptos = IPV6_TRAFFIC_CLASS(ip6);
#endif
		ip_len = sizeof(*ip6) + tcp_data_len;
		break;
	}
#endif
#ifdef INET
	case ETHERTYPE_IP:
	{
		CURVNET_SET(lc->ifp->if_vnet);
		if (V_ipforwarding != 0) {
			/* XXX-BZ stats but changing lro_ctrl is a problem. */
			CURVNET_RESTORE();
			return (TCP_LRO_CANNOT);
		}
		CURVNET_RESTORE();
		l3hdr = ip4 = (struct ip *)(eh + 1);
		error = tcp_lro_rx_ipv4(lc, m, ip4, &th);
		if (error != 0)
			return (error);
		ip_len = ntohs(ip4->ip_len);
#ifdef TCPHPTS
		iptos = ip4->ip_tos;
#endif
		tcp_data_len = ip_len - sizeof(*ip4);
		break;
	}
#endif
	/* XXX-BZ what happens in case of VLAN(s)? */
	default:
		return (TCP_LRO_NOT_SUPPORTED);
	}

	/*
	 * If the frame is padded beyond the end of the IP packet, then we must
	 * trim the extra bytes off.
	 */
	hdr_len = m->m_pkthdr.len - (ETHER_HDR_LEN + ip_len);
	if (hdr_len != 0) {
		if (hdr_len < 0)
			/* Truncated packet. */
			return (TCP_LRO_CANNOT);

		m_adj(m, -hdr_len);
	}
	/*
	 * Check TCP header constraints.
	 */
	hdr_len = (th->th_off << 2);
	ts_ptr = (uint32_t *)(th + 1);
	tcp_data_len -= hdr_len;
	hdr_len -= sizeof(*th);
	if (th->th_flags & TH_SYN)
		return (TCP_LRO_CANNOT);
	if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) {
		need_flush = 1;
	} else
		need_flush = 0;
	if (hdr_len != 0 && (__predict_false(hdr_len != TCPOLEN_TSTAMP_APPA) ||
	    (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
	    TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) {
		/*
		 * We have an option besides timestamps, most likely
		 * a SACK, which means we will probably need to wake up
		 * a sleeper (if the transport does queueing).
		 */
		need_flush = 2;
	}
	/* If the driver did not pass in the checksum, set it now. */
	if (csum == 0x0000)
		csum = th->th_sum;
	seq = ntohl(th->th_seq);
	if (!use_hash) {
		bucket = &lc->lro_hash[0];
	} else if (M_HASHTYPE_ISHASH(m)) {
		bucket = &lc->lro_hash[m->m_pkthdr.flowid % lc->lro_hashsz];
	} else {
		uint32_t hash;

		switch (eh_type) {
#ifdef INET
		case ETHERTYPE_IP:
			hash = ip4->ip_src.s_addr + ip4->ip_dst.s_addr;
			break;
#endif
#ifdef INET6
		case ETHERTYPE_IPV6:
			hash = ip6->ip6_src.s6_addr32[0] +
			    ip6->ip6_dst.s6_addr32[0];
			hash += ip6->ip6_src.s6_addr32[1] +
			    ip6->ip6_dst.s6_addr32[1];
			hash += ip6->ip6_src.s6_addr32[2] +
			    ip6->ip6_dst.s6_addr32[2];
			hash += ip6->ip6_src.s6_addr32[3] +
			    ip6->ip6_dst.s6_addr32[3];
			break;
#endif
		default:
			hash = 0;
			break;
		}
		hash += th->th_sport + th->th_dport;
		bucket = &lc->lro_hash[hash % lc->lro_hashsz];
	}
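	/*
	 * Bucket selection sketch: with use_hash == 0 everything lands
	 * in bucket 0 (tcp_lro_flush_all() already sorted the packets
	 * by stream).  Otherwise the driver-supplied flowid is
	 * preferred and the additive hash is only a fallback; e.g. for
	 * IPv4 it is (ip_src + ip_dst + th_sport + th_dport) modulo
	 * lc->lro_hashsz.
	 */
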
	/* Try to find a matching previous segment. */
	LIST_FOREACH(le, bucket, hash_next) {
		if (le->eh_type != eh_type)
			continue;
		if (le->source_port != th->th_sport ||
		    le->dest_port != th->th_dport)
			continue;
		switch (eh_type) {
#ifdef INET6
		case ETHERTYPE_IPV6:
			if (bcmp(&le->source_ip6, &ip6->ip6_src,
			    sizeof(struct in6_addr)) != 0 ||
			    bcmp(&le->dest_ip6, &ip6->ip6_dst,
			    sizeof(struct in6_addr)) != 0)
				continue;
			break;
#endif
#ifdef INET
		case ETHERTYPE_IP:
			if (le->source_ip4 != ip4->ip_src.s_addr ||
			    le->dest_ip4 != ip4->ip_dst.s_addr)
				continue;
			break;
#endif
		}
		if (tcp_data_len || SEQ_GT(ntohl(th->th_ack),
		    ntohl(le->ack_seq)) ||
		    (th->th_ack == le->ack_seq)) {
			m->m_pkthdr.lro_len = tcp_data_len;
		} else {
			/* no data and old ack */
			m_freem(m);
			return (0);
		}
#ifdef TCPHPTS
		if ((tcplro_stacks_wanting_mbufq == 0) ||
		    (m->m_flags & M_VLANTAG))
			goto skip_lookup_a;
		if (le->inp == NULL) {
			CURVNET_SET(lc->ifp->if_vnet);
			le->inp = tcp_lro_lookup(lc, le);
			if (le->inp) {
				in_pcbref(le->inp);
				locked = 1;
			}
			CURVNET_RESTORE();
		} else if (le->inp) {
			INP_WLOCK(le->inp);
			locked = 1;
		}
		if (locked &&
		    ((le->inp->inp_flags & (INP_DROPPED|INP_TIMEWAIT)) ||
		    (le->inp->inp_flags2 & INP_FREED))) {
			/*
			 * We can't present these to the inp since
			 * it's dead, Jim.
			 */
			int ret;

			ret = in_pcbrele_wlocked(le->inp);
			if (ret == 0)
				INP_WUNLOCK(le->inp);
			le->inp = NULL;
			locked = 0;
			tcp_lro_active_remove(le);
			if (le->strip_cnt && le->m_head) {
				/*
				 * If we have any stripped packets we
				 * just dump the whole chain.  The
				 * tcp_lro_flush code knows how
				 * to handle things when le->m_head is NULL
				 * and even le->inp is NULL.
				 */
				lro_free_mbuf_chain(le->m_head);
				le->m_head = NULL;
			}
			tcp_lro_flush(lc, le);
			return (TCP_LRO_CANNOT);
		}
		/* See if it has been switched on */
		if (le->inp && (le->inp->inp_flags2 & INP_MBUF_ACKCMP))
			le->flags |= CAN_USE_ACKCMP;

		if ((need_flush == 1) &&
		    le->inp &&
		    (le->inp->inp_flags2 &
		    (INP_MBUF_ACKCMP|INP_SUPPORTS_MBUFQ)) &&
		    ((th->th_flags & ~(TH_ACK | TH_PUSH | TH_ECE | TH_CWR)) == 0)) {
			/*
			 * For MBUF queuing or ACKCMP we can accept ECE and CWR
			 * since each packet is sent to the transport (or the
			 * compressed state including the ECN bits).
			 */
			need_flush = 0;
		}
skip_lookup_a:
#endif
		if (need_flush)
			le->need_wakeup = need_flush;
		/* Save off the data-only csum */
		m->m_pkthdr.rcvif = lc->ifp;
		m->m_pkthdr.lro_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th,
		    tcp_data_len, ~csum);
		th->th_sum = csum;	/* Restore checksum */
#ifdef TCPHPTS
		if ((le->flags & CAN_USE_ACKCMP) ||
		    (le->inp &&
		    (le->inp->inp_flags2 &
		    (INP_MBUF_ACKCMP|INP_SUPPORTS_MBUFQ)))) {
			/*
			 * Mbuf-queued and ACKCMP packets have their BPF and
			 * csum done here in LRO.  They will still end up
			 * looking at the headers and such (IP/TCP) but we
			 * don't want to proceed with any bad csum!
			 */
			m = do_bpf_and_csum(le->inp, lc, le, eh, m,
			    bpf_peers_present(lc->ifp->if_bpf), locked);
			if (m == NULL) {
				/* Bad csum, accounting already done */
				if (locked) {
					INP_WUNLOCK(le->inp);
				}
				return (0);
			}
			le->strip_cnt++;
		}
		if ((need_flush == 0) &&
		    (th->th_flags & TH_ACK) &&
		    (tcp_data_len == 0) &&
		    (le->flags & CAN_USE_ACKCMP)) {
			/*
			 * OK, this is a pure ack; let's find out if our
			 * last packet already has one of these.
			 */
			struct mbuf *nm;
			struct tcp_ackent *ack_ent;
			int idx;

			INP_WLOCK_ASSERT(le->inp);
			if (le->m_head == NULL) {
				/* Can we still use the end of the inp's queue? */
				nm = tcp_lro_get_last_if_ackcmp(lc, le,
				    le->inp);
				if (nm == NULL) {
					/* gone or full */
					goto new_one;
				}
				/* We can add in to the one on the tail */
				ack_ent = mtod(nm, struct tcp_ackent *);
				idx = (nm->m_len / sizeof(struct tcp_ackent));
				build_ack_entry(&ack_ent[idx], th, m,
				    hdr_len, iptos);
				/* Bump the size of both pkt-hdr and len */
				nm->m_len += sizeof(struct tcp_ackent);
				nm->m_pkthdr.len += sizeof(struct tcp_ackent);
				le->ack_seq = th->th_ack;
				le->window = th->th_win;
				m_freem(m);
				counter_u64_add(tcp_extra_mbuf, 1);
				INP_WUNLOCK(le->inp);
				return (0);
			} else if (le->m_last_mbuf->m_flags & M_ACKCMP) {
				/* Yes, we might be able to be appended to */
				nm = le->m_last_mbuf;
				if (M_TRAILINGSPACE(nm) <
				    sizeof(struct tcp_ackent)) {
					if ((le->inp->inp_flags2 &
					    INP_MBUF_L_ACKS) == 0) {
						counter_u64_add(tcp_would_have_but, 1);
						le->inp->inp_flags2 |= INP_MBUF_L_ACKS;
					}
					goto new_one;
				}
				/* we have room */
				ack_ent = mtod(nm, struct tcp_ackent *);
				idx = (nm->m_len / sizeof(struct tcp_ackent));
				build_ack_entry(&ack_ent[idx], th, m,
				    hdr_len, iptos);
				/* Bump the size of both pkt-hdr and len */
				nm->m_len += sizeof(struct tcp_ackent);
				nm->m_pkthdr.len += sizeof(struct tcp_ackent);
				m_freem(m);
				le->flags |= HAS_COMP_ENTRIES;
				le->cmp_ack_cnt++;
				goto compressed;
			} else {
				/* Nope we need a new one */
new_one:
				if (le->inp->inp_flags2 & INP_MBUF_L_ACKS)
					nm = m_getcl(M_NOWAIT, MT_DATA,
					    (M_ACKCMP|M_PKTHDR));
				else {
					nm = m_gethdr(M_NOWAIT, MT_DATA);
					/*
					 * m_gethdr() may fail; only set the
					 * flag if we got an mbuf.
					 */
					if (nm != NULL)
						nm->m_flags |= M_ACKCMP;
				}
				if (nm) {
					nm->m_pkthdr.rcvif = lc->ifp;
					ack_ent = mtod(nm, struct tcp_ackent *);
					build_ack_entry(ack_ent, th, m,
					    hdr_len, iptos);
					m_freem(m);
					m = nm;
					m->m_pkthdr.len = m->m_len =
					    sizeof(struct tcp_ackent);
					le->flags |= HAS_COMP_ENTRIES;
					le->cmp_ack_cnt++;
				}
				/* We fall through and append */
			}
		}
		if (m->m_flags & M_ACKCMP) {
			counter_u64_add(tcp_comp_total, 1);
		} else {
			counter_u64_add(tcp_uncomp_total, 1);
		}
#endif
		/* Save off the tail I am appending to (prev) */
		m->m_nextpkt = NULL;
		if (le->m_head == NULL) {
			/*
			 * Case where we were chaining off the inp
			 * and now no longer can.
			 */
			le->m_head = m;
			le->m_tail = m_last(m);
			le->m_last_mbuf = m;
			le->m_prev_last = NULL;
		} else {
			le->m_prev_last = le->m_last_mbuf;
			/* Mark me in the last spot */
			le->m_last_mbuf->m_nextpkt = m;
			/* Now set the tail to me */
			le->m_last_mbuf = m;
			le->tcp_tot_p_len += tcp_data_len;
		}
#ifdef TCPHPTS
compressed:
#endif
		le->mbuf_cnt++;
		/* Add to the total size of data */
		lro_set_mtime(&le->mtime, &arrv);
		if (locked)
			INP_WUNLOCK(le->inp);
		return (0);
	}
	/* Try to find an empty slot. */
	if (LIST_EMPTY(&lc->lro_free))
		return (TCP_LRO_NO_ENTRIES);

	/* Start a new segment chain. */
	le = LIST_FIRST(&lc->lro_free);
	LIST_REMOVE(le, next);
	tcp_lro_active_insert(lc, bucket, le);
	lro_set_mtime(&le->mtime, &arrv);

	/* Start filling in details. */
	switch (eh_type) {
#ifdef INET6
	case ETHERTYPE_IPV6:
		le->le_ip6 = ip6;
		le->source_ip6 = ip6->ip6_src;
		le->dest_ip6 = ip6->ip6_dst;
		le->eh_type = eh_type;
		le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN - sizeof(*ip6);
		break;
#endif
#ifdef INET
	case ETHERTYPE_IP:
		le->le_ip4 = ip4;
		le->source_ip4 = ip4->ip_src.s_addr;
		le->dest_ip4 = ip4->ip_dst.s_addr;
		le->eh_type = eh_type;
		le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN;
		break;
#endif
	}
	le->source_port = th->th_sport;
	le->dest_port = th->th_dport;
	le->next_seq = seq + tcp_data_len;
	le->ack_seq = th->th_ack;
	le->window = th->th_win;
	if (hdr_len != 0) {
		le->timestamp = 1;
		le->tsval = ntohl(*(ts_ptr + 1));
		le->tsecr = *(ts_ptr + 2);
	}
	KASSERT(le->ulp_csum == 0, ("%s: le=%p le->ulp_csum=0x%04x\n",
	    __func__, le, le->ulp_csum));

	le->append_cnt = 0;
	le->ulp_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th, tcp_data_len,
	    ~csum);
	le->append_cnt++;
	th->th_sum = csum;	/* Restore checksum */
	m->m_pkthdr.rcvif = lc->ifp;
	m->m_pkthdr.lro_len = tcp_data_len;
	le->mbuf_cnt = 1;
	le->cmp_ack_cnt = 0;
	le->flags = 0;
#ifdef TCPHPTS
	/*
	 * Let's find out if we can use mbuf compression.
	 */
	if ((tcplro_stacks_wanting_mbufq == 0) || (m->m_flags & M_VLANTAG))
		goto skip_lookup_b;
	CURVNET_SET(lc->ifp->if_vnet);
	le->inp = tcp_lro_lookup(lc, le);
	if (le->inp && ((le->inp->inp_flags & (INP_DROPPED|INP_TIMEWAIT)) ||
	    (le->inp->inp_flags2 & INP_FREED))) {
		INP_WUNLOCK(le->inp);
		le->inp = NULL;
	}
	if (le->inp) {
		if ((need_flush == 1) &&
		    (le->inp->inp_flags2 &
		    (INP_MBUF_ACKCMP|INP_SUPPORTS_MBUFQ)) &&
		    ((th->th_flags & ~(TH_ACK | TH_PUSH | TH_ECE | TH_CWR)) == 0)) {
			/*
			 * For MBUF queuing or ACKCMP we can accept ECE and CWR
			 * since each packet is sent to the transport (or the
			 * compressed state including the ECN bits).
			 */
			need_flush = 0;
		}
		locked = 1;
		if (le->inp->inp_flags2 & INP_MBUF_ACKCMP)
			le->flags |= CAN_USE_ACKCMP;
		if ((le->flags & CAN_USE_ACKCMP) ||
		    (le->inp &&
		    (le->inp->inp_flags2 &
		    (INP_MBUF_ACKCMP|INP_SUPPORTS_MBUFQ)))) {
			m = do_bpf_and_csum(le->inp, lc, le, eh, m,
			    bpf_peers_present(lc->ifp->if_bpf), locked);
			if (m == NULL) {
				/* Bad csum, accounting already done */
				INP_WUNLOCK(le->inp);
				le->inp = NULL;
				return (0);
			}
			le->strip_cnt++;
		}
		in_pcbref(le->inp);
	}
	CURVNET_RESTORE();
	if ((need_flush == 0) &&
	    (th->th_flags & TH_ACK) &&
	    (tcp_data_len == 0) &&
	    (le->flags & CAN_USE_ACKCMP)) {
		/* OK, this is a pure ack; build our special COMPRESS mbuf */
		struct mbuf *nm;
		struct tcp_ackent *ack_ent;

		/* Can we use the last mbuf on the inp's queue? */
		INP_WLOCK_ASSERT(le->inp);
		nm = tcp_lro_get_last_if_ackcmp(lc, le, le->inp);
		if (nm) {
			int idx;

			/* We can add in to the one on the tail */
			ack_ent = mtod(nm, struct tcp_ackent *);
			idx = (nm->m_len / sizeof(struct tcp_ackent));
			build_ack_entry(&ack_ent[idx], th, m, hdr_len, iptos);
			nm->m_len += sizeof(struct tcp_ackent);
			nm->m_pkthdr.len += sizeof(struct tcp_ackent);
			le->ack_seq = th->th_ack;
			le->window = th->th_win;
			m_freem(m);
			counter_u64_add(tcp_extra_mbuf, 1);
			le->m_head = NULL;
			le->m_tail = NULL;
			le->m_last_mbuf = NULL;
			le->m_prev_last = NULL;
			INP_WUNLOCK(le->inp);
			return (0);
		} else {
			if (le->inp->inp_flags2 & INP_MBUF_L_ACKS)
				nm = m_getcl(M_NOWAIT, MT_DATA,
				    (M_ACKCMP|M_PKTHDR));
			else {
				nm = m_gethdr(M_NOWAIT, MT_DATA);
				/*
				 * m_gethdr() may fail; only set the
				 * flag if we got an mbuf.
				 */
				if (nm != NULL)
					nm->m_flags |= M_ACKCMP;
			}
			if (nm) {
				nm->m_pkthdr.rcvif = lc->ifp;
				ack_ent = mtod(nm, struct tcp_ackent *);
				build_ack_entry(ack_ent, th, m, hdr_len,
				    iptos);
				m_freem(m);
				m = nm;
				m->m_pkthdr.len = m->m_len =
				    sizeof(struct tcp_ackent);
				le->flags |= HAS_COMP_ENTRIES;
				le->cmp_ack_cnt++;
			}
		}
	}
	if (m->m_flags & M_ACKCMP) {
		counter_u64_add(tcp_comp_total, 1);
	} else {
		counter_u64_add(tcp_uncomp_total, 1);
	}
skip_lookup_b:
#endif
	if (need_flush)
		le->need_wakeup = need_flush;
	else
		le->need_wakeup = 0;
	m->m_nextpkt = NULL;
	le->m_head = m;
	le->m_tail = m_last(m);
	le->m_last_mbuf = m;
	le->m_prev_last = NULL;
	/*
	 * We keep the total size here for cross checking when we may need
	 * to flush/wakeup in the MBUF_QUEUE case.
	 */
	le->tcp_tot_p_len = tcp_data_len;
	if (locked)
		INP_WUNLOCK(le->inp);
	return (0);
}

int
tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
{

	return (tcp_lro_rx2(lc, m, csum, 1));
}

void
tcp_lro_queue_mbuf(struct lro_ctrl *lc, struct mbuf *mb)
{
	struct timespec arrv;

	/* sanity checks */
	if (__predict_false(lc->ifp == NULL || lc->lro_mbuf_data == NULL ||
	    lc->lro_mbuf_max == 0)) {
		/* packet drop */
		m_freem(mb);
		return;
	}

	/* check if packet is not LRO capable */
	if (__predict_false(mb->m_pkthdr.csum_flags == 0 ||
	    (lc->ifp->if_capenable & IFCAP_LRO) == 0)) {
		/* input packet to network layer */
		(*lc->ifp->if_input) (lc->ifp, mb);
		return;
	}
	/* Arrival-stamp the packet */
	if ((mb->m_flags & M_TSTMP) == 0) {
		/* If there is no hardware arrival stamp, add one. */
		nanouptime(&arrv);
		mb->m_pkthdr.rcv_tstmp = ((arrv.tv_sec * 1000000000) +
		    arrv.tv_nsec);
		mb->m_flags |= M_TSTMP_LRO;
	}
	/* create sequence number */
	lc->lro_mbuf_data[lc->lro_mbuf_count].seq =
	    (((uint64_t)M_HASHTYPE_GET(mb)) << 56) |
	    (((uint64_t)mb->m_pkthdr.flowid) << 24) |
	    ((uint64_t)lc->lro_mbuf_count);

	/* enter mbuf */
	lc->lro_mbuf_data[lc->lro_mbuf_count].mb = mb;

	/* flush if array is full */
	if (__predict_false(++lc->lro_mbuf_count == lc->lro_mbuf_max))
		tcp_lro_flush_all(lc);
}

/* end */