/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2007, Myricom Inc.
 * Copyright (c) 2008, Intel Corporation.
 * Copyright (c) 2012 The FreeBSD Foundation
 * Copyright (c) 2016 Mellanox Technologies.
 * All rights reserved.
 *
 * Portions of this software were developed by Bjoern Zeeb
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockbuf.h>
#include <sys/sysctl.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/ethernet.h>
#include <net/bpf.h>
#include <net/vnet.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/in_pcb.h>
#include <netinet6/in6_pcb.h>
#include <netinet/tcp.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_lro.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_log_buf.h>
#include <netinet6/ip6_var.h>

#include <machine/in_cksum.h>

static MALLOC_DEFINE(M_LRO, "LRO", "LRO control structures");

#define	TCP_LRO_UPDATE_CSUM	1
#ifndef	TCP_LRO_UPDATE_CSUM
#define	TCP_LRO_INVALID_CSUM	0x0000
#endif

static void	tcp_lro_rx_done(struct lro_ctrl *lc);
static int	tcp_lro_rx2(struct lro_ctrl *lc, struct mbuf *m,
		    uint32_t csum, int use_hash);

SYSCTL_NODE(_net_inet_tcp, OID_AUTO, lro, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "TCP LRO");

static long tcplro_stacks_wanting_mbufq = 0;
counter_u64_t tcp_inp_lro_direct_queue;
counter_u64_t tcp_inp_lro_wokeup_queue;
counter_u64_t tcp_inp_lro_compressed;
counter_u64_t tcp_inp_lro_single_push;
counter_u64_t tcp_inp_lro_locks_taken;
counter_u64_t tcp_inp_lro_sack_wake;
counter_u64_t tcp_extra_mbuf;
counter_u64_t tcp_would_have_but;
counter_u64_t tcp_comp_total;
counter_u64_t tcp_uncomp_total;
counter_u64_t tcp_csum_hardware;
counter_u64_t tcp_csum_hardware_w_ph;
counter_u64_t tcp_csum_software;

static unsigned	tcp_lro_entries = TCP_LRO_ENTRIES;
SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, entries,
    CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_entries, 0,
    "default number of LRO entries");

SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, fullqueue, CTLFLAG_RD,
    &tcp_inp_lro_direct_queue, "Number of lro's fully queued to transport");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, wokeup, CTLFLAG_RD,
    &tcp_inp_lro_wokeup_queue, "Number of lro's where we woke up transport via hpts");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, compressed, CTLFLAG_RD,
    &tcp_inp_lro_compressed, "Number of lro's compressed and sent to transport");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, single, CTLFLAG_RD,
    &tcp_inp_lro_single_push, "Number of lro's sent with single segment");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, lockcnt, CTLFLAG_RD,
    &tcp_inp_lro_locks_taken, "Number of lro's inp_wlocks taken");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, sackwakeups, CTLFLAG_RD,
    &tcp_inp_lro_sack_wake, "Number of wakeups caused by sack/fin");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, extra_mbuf, CTLFLAG_RD,
    &tcp_extra_mbuf, "Number of times we had an extra compressed ack dropped into the tp");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, would_have_but, CTLFLAG_RD,
    &tcp_would_have_but, "Number of times we would have had an extra compressed ack but were out of room");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, with_m_ackcmp, CTLFLAG_RD,
    &tcp_comp_total, "Number of mbufs queued with M_ACKCMP flags set");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, without_m_ackcmp, CTLFLAG_RD,
    &tcp_uncomp_total, "Number of mbufs queued without M_ACKCMP");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, csum_hw, CTLFLAG_RD,
    &tcp_csum_hardware, "Number of checksums processed in hardware");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, csum_hw_ph, CTLFLAG_RD,
    &tcp_csum_hardware_w_ph, "Number of checksums processed in hardware with pseudo header");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, csum_sw, CTLFLAG_RD,
    &tcp_csum_software, "Number of checksums processed in software");

void
tcp_lro_reg_mbufq(void)
{
	atomic_fetchadd_long(&tcplro_stacks_wanting_mbufq, 1);
}

void
tcp_lro_dereg_mbufq(void)
{
	atomic_fetchadd_long(&tcplro_stacks_wanting_mbufq, -1);
}
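
/*
 * Note: tcplro_stacks_wanting_mbufq counts how many TCP stacks have
 * registered a desire for mbuf queueing. tcp_lro_flush() and
 * tcp_lro_rx2() below only attempt the inpcb lookup (and with it the
 * mbuf-queue and compressed-ack paths) while this count is non-zero.
 */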
static __inline void
tcp_lro_active_insert(struct lro_ctrl *lc, struct lro_head *bucket,
    struct lro_entry *le)
{

	LIST_INSERT_HEAD(&lc->lro_active, le, next);
	LIST_INSERT_HEAD(bucket, le, hash_next);
}

static __inline void
tcp_lro_active_remove(struct lro_entry *le)
{

	LIST_REMOVE(le, next);		/* active list */
	LIST_REMOVE(le, hash_next);	/* hash bucket */
}

int
tcp_lro_init(struct lro_ctrl *lc)
{
	return (tcp_lro_init_args(lc, NULL, tcp_lro_entries, 0));
}

int
tcp_lro_init_args(struct lro_ctrl *lc, struct ifnet *ifp,
    unsigned lro_entries, unsigned lro_mbufs)
{
	struct lro_entry *le;
	size_t size;
	unsigned i, elements;

	lc->lro_bad_csum = 0;
	lc->lro_queued = 0;
	lc->lro_flushed = 0;
	lc->lro_mbuf_count = 0;
	lc->lro_mbuf_max = lro_mbufs;
	lc->lro_cnt = lro_entries;
	lc->lro_ackcnt_lim = TCP_LRO_ACKCNT_MAX;
	lc->lro_length_lim = TCP_LRO_LENGTH_MAX;
	lc->ifp = ifp;
	LIST_INIT(&lc->lro_free);
	LIST_INIT(&lc->lro_active);

	/* create hash table to accelerate entry lookup */
	if (lro_entries > lro_mbufs)
		elements = lro_entries;
	else
		elements = lro_mbufs;
	lc->lro_hash = phashinit_flags(elements, M_LRO, &lc->lro_hashsz,
	    HASH_NOWAIT);
	if (lc->lro_hash == NULL) {
		memset(lc, 0, sizeof(*lc));
		return (ENOMEM);
	}

	/* compute size to allocate */
	size = (lro_mbufs * sizeof(struct lro_mbuf_sort)) +
	    (lro_entries * sizeof(*le));
	lc->lro_mbuf_data = (struct lro_mbuf_sort *)
	    malloc(size, M_LRO, M_NOWAIT | M_ZERO);

	/* check for out of memory */
	if (lc->lro_mbuf_data == NULL) {
		free(lc->lro_hash, M_LRO);
		memset(lc, 0, sizeof(*lc));
		return (ENOMEM);
	}
	/* compute offset for LRO entries */
	le = (struct lro_entry *)
	    (lc->lro_mbuf_data + lro_mbufs);

	/* setup linked list */
	for (i = 0; i != lro_entries; i++)
		LIST_INSERT_HEAD(&lc->lro_free, le + i, next);

	return (0);
}

static struct tcphdr *
tcp_lro_get_th(struct lro_entry *le, struct mbuf *m)
{
	struct ether_header *eh;
	struct tcphdr *th = NULL;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;	/* Keep compiler happy. */
#endif
#ifdef INET
	struct ip *ip4 = NULL;		/* Keep compiler happy. */
#endif

	eh = mtod(m, struct ether_header *);
	switch (le->eh_type) {
#ifdef INET6
	case ETHERTYPE_IPV6:
		ip6 = (struct ip6_hdr *)(eh + 1);
		th = (struct tcphdr *)(ip6 + 1);
		break;
#endif
#ifdef INET
	case ETHERTYPE_IP:
		ip4 = (struct ip *)(eh + 1);
		th = (struct tcphdr *)(ip4 + 1);
		break;
#endif
	}
	return (th);
}

static void
lro_free_mbuf_chain(struct mbuf *m)
{
	struct mbuf *save;

	while (m) {
		save = m->m_nextpkt;
		m->m_nextpkt = NULL;
		m_freem(m);
		m = save;
	}
}

void
tcp_lro_free(struct lro_ctrl *lc)
{
	struct lro_entry *le;
	unsigned x;

	/* reset LRO free list */
	LIST_INIT(&lc->lro_free);

	/* free active mbufs, if any */
	while ((le = LIST_FIRST(&lc->lro_active)) != NULL) {
		tcp_lro_active_remove(le);
		lro_free_mbuf_chain(le->m_head);
	}

	/* free hash table */
	free(lc->lro_hash, M_LRO);
	lc->lro_hash = NULL;
	lc->lro_hashsz = 0;

	/* free mbuf array, if any */
	for (x = 0; x != lc->lro_mbuf_count; x++)
		m_freem(lc->lro_mbuf_data[x].mb);
	lc->lro_mbuf_count = 0;

	/* free allocated memory, if any */
	free(lc->lro_mbuf_data, M_LRO);
	lc->lro_mbuf_data = NULL;
}

static uint16_t
tcp_lro_csum_th(struct tcphdr *th)
{
	uint32_t ch;
	uint16_t *p, l;

	ch = th->th_sum = 0x0000;
	l = th->th_off;
	p = (uint16_t *)th;
	while (l > 0) {
		ch += *p;
		p++;
		ch += *p;
		p++;
		l--;
	}
	while (ch > 0xffff)
		ch = (ch >> 16) + (ch & 0xffff);

	return (ch & 0xffff);
}
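
/*
 * Checksum note: tcp_lro_csum_th() zeroes th_sum and then returns the
 * folded one's complement sum over the TCP header alone. Since th_off
 * counts 32-bit words, each loop iteration above consumes two 16-bit
 * halfwords. For example, halfwords 0xffff and 0x0001 sum to 0x10000,
 * which folds to (0x10000 >> 16) + (0x10000 & 0xffff) = 0x0001.
 */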
static uint16_t
tcp_lro_rx_csum_fixup(struct lro_entry *le, void *l3hdr, struct tcphdr *th,
    uint16_t tcp_data_len, uint16_t csum)
{
	uint32_t c;
	uint16_t cs;

	c = csum;

	/* Remove length from checksum. */
	switch (le->eh_type) {
#ifdef INET6
	case ETHERTYPE_IPV6:
	{
		struct ip6_hdr *ip6;

		ip6 = (struct ip6_hdr *)l3hdr;
		if (le->append_cnt == 0)
			cs = ip6->ip6_plen;
		else {
			uint32_t cx;

			cx = ntohs(ip6->ip6_plen);
			cs = in6_cksum_pseudo(ip6, cx, ip6->ip6_nxt, 0);
		}
		break;
	}
#endif
#ifdef INET
	case ETHERTYPE_IP:
	{
		struct ip *ip4;

		ip4 = (struct ip *)l3hdr;
		if (le->append_cnt == 0)
			cs = ip4->ip_len;
		else {
			cs = in_addword(ntohs(ip4->ip_len) - sizeof(*ip4),
			    IPPROTO_TCP);
			cs = in_pseudo(ip4->ip_src.s_addr, ip4->ip_dst.s_addr,
			    htons(cs));
		}
		break;
	}
#endif
	default:
		cs = 0;		/* Keep compiler happy. */
	}

	cs = ~cs;
	c += cs;

	/* Remove TCP header csum. */
	cs = ~tcp_lro_csum_th(th);
	c += cs;
	while (c > 0xffff)
		c = (c >> 16) + (c & 0xffff);

	return (c & 0xffff);
}

static void
tcp_lro_rx_done(struct lro_ctrl *lc)
{
	struct lro_entry *le;

	while ((le = LIST_FIRST(&lc->lro_active)) != NULL) {
		tcp_lro_active_remove(le);
		tcp_lro_flush(lc, le);
	}
}

void
tcp_lro_flush_inactive(struct lro_ctrl *lc, const struct timeval *timeout)
{
	struct lro_entry *le, *le_tmp;
	struct timeval tv;

	if (LIST_EMPTY(&lc->lro_active))
		return;

	getmicrouptime(&tv);
	timevalsub(&tv, timeout);
	LIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) {
		if (timevalcmp(&tv, &le->mtime, >=)) {
			tcp_lro_active_remove(le);
			tcp_lro_flush(lc, le);
		}
	}
}

#ifdef INET6
static int
tcp_lro_rx_ipv6(struct lro_ctrl *lc, struct mbuf *m, struct ip6_hdr *ip6,
    struct tcphdr **th)
{

	/* XXX-BZ we should check the flow-label. */

	/* XXX-BZ We do not yet support ext. hdrs. */
	if (ip6->ip6_nxt != IPPROTO_TCP)
		return (TCP_LRO_NOT_SUPPORTED);

	/* Find the TCP header. */
	*th = (struct tcphdr *)(ip6 + 1);

	return (0);
}
#endif

#ifdef INET
static int
tcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4,
    struct tcphdr **th)
{
	int csum_flags;
	uint16_t csum;

	if (ip4->ip_p != IPPROTO_TCP)
		return (TCP_LRO_NOT_SUPPORTED);

	/* Ensure there are no options. */
	if ((ip4->ip_hl << 2) != sizeof(*ip4))
		return (TCP_LRO_CANNOT);

	/* .. and the packet is not fragmented. */
	if (ip4->ip_off & htons(IP_MF|IP_OFFMASK))
		return (TCP_LRO_CANNOT);

	/* Legacy IP has a header checksum that needs to be correct. */
	csum_flags = m->m_pkthdr.csum_flags;
	if (csum_flags & CSUM_IP_CHECKED) {
		if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) {
			lc->lro_bad_csum++;
			return (TCP_LRO_CANNOT);
		}
	} else {
		csum = in_cksum_hdr(ip4);
		if (__predict_false(csum != 0)) {
			lc->lro_bad_csum++;
			return (TCP_LRO_CANNOT);
		}
	}
	/* Find the TCP header (we assured there are no IP options). */
	*th = (struct tcphdr *)(ip4 + 1);
	return (0);
}
#endif
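
/*
 * Note the distinction in the return values above: TCP_LRO_NOT_SUPPORTED
 * means the packet is not TCP at all, while TCP_LRO_CANNOT means it is
 * TCP but not aggregatable (IP options, fragments, or a bad header
 * checksum). Either way the caller (the driver or tcp_lro_flush_all())
 * is expected to input the packet unmodified.
 */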
static void
tcp_lro_log(struct tcpcb *tp, struct lro_ctrl *lc,
    struct lro_entry *le, struct mbuf *m, int frm, int32_t tcp_data_len,
    uint32_t th_seq, uint32_t th_ack, uint16_t th_win)
{
	if (tp->t_logstate != TCP_LOG_STATE_OFF) {
		union tcp_log_stackspecific log;
		struct timeval tv;
		uint32_t cts;

		cts = tcp_get_usecs(&tv);
		memset(&log, 0, sizeof(union tcp_log_stackspecific));
		log.u_bbr.flex8 = frm;
		log.u_bbr.flex1 = tcp_data_len;
		if (m)
			log.u_bbr.flex2 = m->m_pkthdr.len;
		else
			log.u_bbr.flex2 = 0;
		log.u_bbr.flex3 = le->append_cnt;
		log.u_bbr.flex4 = le->p_len;
		if (le->m_head) {
			log.u_bbr.flex5 = le->m_head->m_pkthdr.len;
			log.u_bbr.delRate = le->m_head->m_flags;
			log.u_bbr.rttProp = le->m_head->m_pkthdr.rcv_tstmp;
		}
		log.u_bbr.inflight = th_seq;
		log.u_bbr.timeStamp = cts;
		log.u_bbr.epoch = le->next_seq;
		log.u_bbr.delivered = th_ack;
		log.u_bbr.lt_epoch = le->ack_seq;
		log.u_bbr.pacing_gain = th_win;
		log.u_bbr.cwnd_gain = le->window;
		log.u_bbr.cur_del_rate = (uintptr_t)m;
		log.u_bbr.bw_inuse = (uintptr_t)le->m_head;
		log.u_bbr.pkts_out = le->mbuf_cnt;	/* Total mbufs added */
		log.u_bbr.applimited = le->ulp_csum;
		log.u_bbr.lost = le->mbuf_appended;
		log.u_bbr.pkt_epoch = le->cmp_ack_cnt;
		log.u_bbr.flex6 = tcp_tv_to_usectick(&lc->lro_last_flush);
		if (in_epoch(net_epoch_preempt))
			log.u_bbr.inhpts = 1;
		else
			log.u_bbr.inhpts = 0;
		TCP_LOG_EVENTP(tp, NULL,
		    &tp->t_inpcb->inp_socket->so_rcv,
		    &tp->t_inpcb->inp_socket->so_snd,
		    TCP_LOG_LRO, 0,
		    0, &log, false, &tv);
	}
}
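
/*
 * tcp_lro_log() has no stack-specific log layout of its own; it borrows
 * the u_bbr member of union tcp_log_stackspecific and repurposes its
 * fields (flex8 carries the "frm" event code, inflight/delivered the raw
 * sequence/ack numbers, and so on) so LRO decisions show up in the
 * normal TCP black-box log stream as TCP_LOG_LRO events.
 */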
static void
tcp_flush_out_le(struct tcpcb *tp, struct lro_ctrl *lc, struct lro_entry *le)
{
	if (le->append_cnt > 1) {
		struct tcphdr *th;
		uint16_t p_len;

		p_len = htons(le->p_len);
		switch (le->eh_type) {
#ifdef INET6
		case ETHERTYPE_IPV6:
		{
			struct ip6_hdr *ip6;

			ip6 = le->le_ip6;
			ip6->ip6_plen = p_len;
			th = (struct tcphdr *)(ip6 + 1);
			le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
			    CSUM_PSEUDO_HDR;
			le->p_len += ETHER_HDR_LEN + sizeof(*ip6);
			break;
		}
#endif
#ifdef INET
		case ETHERTYPE_IP:
		{
			struct ip *ip4;
			uint32_t cl;
			uint16_t c;

			ip4 = le->le_ip4;
			/* Fix IP header checksum for new length. */
			c = ~ip4->ip_sum;
			cl = c;
			c = ~ip4->ip_len;
			cl += c + p_len;
			while (cl > 0xffff)
				cl = (cl >> 16) + (cl & 0xffff);
			c = cl;
			ip4->ip_sum = ~c;
			ip4->ip_len = p_len;
			th = (struct tcphdr *)(ip4 + 1);
			le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
			    CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID;
			le->p_len += ETHER_HDR_LEN;
			break;
		}
#endif
		default:
			th = NULL;	/* Keep compiler happy. */
		}
		le->m_head->m_pkthdr.csum_data = 0xffff;
		le->m_head->m_pkthdr.len = le->p_len;

		/* Incorporate the latest ACK into the TCP header. */
		th->th_ack = le->ack_seq;
		th->th_win = le->window;
		/* Incorporate latest timestamp into the TCP header. */
		if (le->timestamp != 0) {
			uint32_t *ts_ptr;

			ts_ptr = (uint32_t *)(th + 1);
			ts_ptr[1] = htonl(le->tsval);
			ts_ptr[2] = le->tsecr;
		}
		/* Update the TCP header checksum. */
		le->ulp_csum += p_len;
		le->ulp_csum += tcp_lro_csum_th(th);
		while (le->ulp_csum > 0xffff)
			le->ulp_csum = (le->ulp_csum >> 16) +
			    (le->ulp_csum & 0xffff);
		th->th_sum = (le->ulp_csum & 0xffff);
		th->th_sum = ~th->th_sum;
	}
	/*
	 * Break any chain. In the singleton case m_nextpkt points to
	 * m_head and is not reset to NULL here; in the other cases
	 * push_and_replace already set m_nextpkt to NULL.
	 */
	le->m_head->m_nextpkt = NULL;
	le->m_head->m_pkthdr.lro_nsegs = le->append_cnt;
	(*lc->ifp->if_input)(lc->ifp, le->m_head);
	lc->lro_queued += le->append_cnt;
}
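
/*
 * The IPv4 branch above updates ip_sum incrementally, in the style of
 * RFC 1624: with HC the old checksum, m the old ip_len and m' the new
 * one, the new checksum is HC' = ~(~HC + ~m + m'), folding the 32-bit
 * accumulator back into 16 bits. This avoids recomputing the checksum
 * over the whole IP header on every flush.
 */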
static void
tcp_set_le_to_m(struct lro_ctrl *lc, struct lro_entry *le, struct mbuf *m)
{
	struct ether_header *eh;
	void *l3hdr = NULL;		/* Keep compiler happy. */
	struct tcphdr *th;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;	/* Keep compiler happy. */
#endif
#ifdef INET
	struct ip *ip4 = NULL;		/* Keep compiler happy. */
#endif
	uint32_t *ts_ptr;
	int error, l, ts_failed = 0;
	uint16_t tcp_data_len;
	uint16_t csum;

	error = -1;
	eh = mtod(m, struct ether_header *);
	/*
	 * We must reset the other pointers since the mbuf
	 * we were pointing to is about to go away.
	 */
	switch (le->eh_type) {
#ifdef INET6
	case ETHERTYPE_IPV6:
		l3hdr = ip6 = (struct ip6_hdr *)(eh + 1);
		error = tcp_lro_rx_ipv6(lc, m, ip6, &th);
		le->le_ip6 = ip6;
		le->source_ip6 = ip6->ip6_src;
		le->dest_ip6 = ip6->ip6_dst;
		le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN - sizeof(*ip6);
		break;
#endif
#ifdef INET
	case ETHERTYPE_IP:
		l3hdr = ip4 = (struct ip *)(eh + 1);
		error = tcp_lro_rx_ipv4(lc, m, ip4, &th);
		le->le_ip4 = ip4;
		le->source_ip4 = ip4->ip_src.s_addr;
		le->dest_ip4 = ip4->ip_dst.s_addr;
		le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN;
		break;
#endif
	}
	KASSERT(error == 0, ("%s: le=%p tcp_lro_rx_xxx failed\n",
	    __func__, le));
	ts_ptr = (uint32_t *)(th + 1);
	l = (th->th_off << 2);
	l -= sizeof(*th);
	if (l != 0 &&
	    (__predict_false(l != TCPOLEN_TSTAMP_APPA) ||
	    (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
	    TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) {
		/* We failed to find a timestamp; some other option? */
		ts_failed = 1;
	}
	if ((l != 0) && (ts_failed == 0)) {
		le->timestamp = 1;
		le->tsval = ntohl(*(ts_ptr + 1));
		le->tsecr = *(ts_ptr + 2);
	} else
		le->timestamp = 0;
	le->source_port = th->th_sport;
	le->dest_port = th->th_dport;
	/* Pull out the csum */
	tcp_data_len = m->m_pkthdr.lro_len;
	le->next_seq = ntohl(th->th_seq) + tcp_data_len;
	le->ack_seq = th->th_ack;
	le->window = th->th_win;
	csum = th->th_sum;
	/* Setup the data pointers */
	le->m_head = m;
	le->m_tail = m_last(m);
	le->append_cnt = 0;
	le->ulp_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th, tcp_data_len,
	    ~csum);
	le->append_cnt++;
	th->th_sum = csum;	/* Restore checksum on first packet. */
}

static void
tcp_push_and_replace(struct tcpcb *tp, struct lro_ctrl *lc,
    struct lro_entry *le, struct mbuf *m)
{
	/*
	 * Push the current le up the stack and replace
	 * it with m.
	 */
	struct mbuf *msave;

	/* Grab off the next and save it */
	msave = le->m_head->m_nextpkt;
	le->m_head->m_nextpkt = NULL;
	/* Now push out the old le entry */
	tcp_flush_out_le(tp, lc, le);
	/*
	 * Now to replace the data properly in the le
	 * we have to reset the TCP header and
	 * other fields.
	 */
	tcp_set_le_to_m(lc, le, m);
	/* Restore the next list */
	m->m_nextpkt = msave;
}
static void
tcp_lro_condense(struct tcpcb *tp, struct lro_ctrl *lc, struct lro_entry *le)
{
	/*
	 * Walk through the mbuf chain we
	 * have on tap and compress/condense
	 * as required.
	 */
	uint32_t *ts_ptr;
	struct mbuf *m;
	struct tcphdr *th;
	uint16_t tcp_data_len, csum_upd;
	int l;

	/*
	 * First we must check the lead (m_head);
	 * we must make sure that it is *not*
	 * something that should be sent up
	 * right away (sack etc).
	 */
again:
	m = le->m_head->m_nextpkt;
	if (m == NULL) {
		/* Just the one left */
		return;
	}
	if (m->m_flags & M_ACKCMP)
		panic("LRO condense lc:%p le:%p reaches with mbuf:%p ackcmp",
		    lc, le, m);
	th = tcp_lro_get_th(le, le->m_head);
	KASSERT(th != NULL,
	    ("le:%p m:%p th comes back NULL?", le, le->m_head));
	l = (th->th_off << 2);
	l -= sizeof(*th);
	ts_ptr = (uint32_t *)(th + 1);
	if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) ||
	    (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
	    TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) {
		/*
		 * It's not the timestamp. We can't
		 * use this guy as the head.
		 */
		le->m_head->m_nextpkt = m->m_nextpkt;
		tcp_push_and_replace(tp, lc, le, m);
		goto again;
	}
	if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) {
		/*
		 * Make sure that previously seen segments/ACKs are delivered
		 * before this segment, e.g. FIN.
		 */
		le->m_head->m_nextpkt = m->m_nextpkt;
		KASSERT(((m->m_flags & M_LRO_EHDRSTRP) == 0),
		    ("tp:%p mbuf:%p has stripped ethernet flags:0x%x",
		    tp, m, m->m_flags));
		tcp_push_and_replace(tp, lc, le, m);
		goto again;
	}
	while ((m = le->m_head->m_nextpkt) != NULL) {
		/*
		 * condense m into le, first
		 * pull m out of the list.
		 */
		KASSERT(((m->m_flags & M_LRO_EHDRSTRP) == 0),
		    ("tp:%p mbuf:%p has stripped ethernet flags:0x%x",
		    tp, m, m->m_flags));
		KASSERT(((m->m_flags & M_ACKCMP) == 0),
		    ("LRO condense lc:%p le:%p reaches with mbuf:%p ackcmp",
		    lc, le, m));
		le->m_head->m_nextpkt = m->m_nextpkt;
		m->m_nextpkt = NULL;
		/* Setup my data */
		tcp_data_len = m->m_pkthdr.lro_len;
		th = tcp_lro_get_th(le, m);
		KASSERT(th != NULL,
		    ("le:%p m:%p th comes back NULL?", le, m));
		ts_ptr = (uint32_t *)(th + 1);
		l = (th->th_off << 2);
		l -= sizeof(*th);
		if (le->append_cnt >= lc->lro_ackcnt_lim) {
			tcp_push_and_replace(tp, lc, le, m);
			goto again;
		}
		if (le->p_len > (lc->lro_length_lim - tcp_data_len)) {
			/* Flush now if appending will result in overflow. */
			tcp_push_and_replace(tp, lc, le, m);
			goto again;
		}
		if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) ||
		    (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
		    TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) {
			/*
			 * Maybe a sack in the new one? We need to
			 * start all over after flushing the
			 * current le. We will go up to the beginning
			 * and flush it (calling the replace again possibly
			 * or just returning).
			 */
			tcp_push_and_replace(tp, lc, le, m);
			goto again;
		}
		if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) {
			tcp_push_and_replace(tp, lc, le, m);
			goto again;
		}
		if (l != 0) {
			uint32_t tsval = ntohl(*(ts_ptr + 1));
			/* Make sure timestamp values are increasing. */
			if (TSTMP_GT(le->tsval, tsval)) {
				tcp_push_and_replace(tp, lc, le, m);
				goto again;
			}
			le->tsval = tsval;
			le->tsecr = *(ts_ptr + 2);
		}
		/* Try to append the new segment. */
		if (__predict_false(ntohl(th->th_seq) != le->next_seq ||
		    (tcp_data_len == 0 &&
		    le->ack_seq == th->th_ack &&
		    le->window == th->th_win))) {
			/* Out of order packet or duplicate ACK. */
			tcp_push_and_replace(tp, lc, le, m);
			goto again;
		}
		if (tcp_data_len || SEQ_GT(ntohl(th->th_ack),
		    ntohl(le->ack_seq))) {
			le->next_seq += tcp_data_len;
			le->ack_seq = th->th_ack;
			le->window = th->th_win;
		} else if (th->th_ack == le->ack_seq) {
			le->window = WIN_MAX(le->window, th->th_win);
		}
		csum_upd = m->m_pkthdr.lro_csum;
		le->ulp_csum += csum_upd;
		if (tcp_data_len == 0) {
			le->append_cnt++;
			le->mbuf_cnt--;
			m_freem(m);
			continue;
		}
		le->append_cnt++;
		le->mbuf_appended++;
		le->p_len += tcp_data_len;
		/*
		 * Adjust the mbuf so that m_data points to the first byte of
		 * the ULP payload. Adjust the mbuf to avoid complications and
		 * append new segment to existing mbuf chain.
		 */
		m_adj(m, m->m_pkthdr.len - tcp_data_len);
		m_demote_pkthdr(m);
		le->m_tail->m_next = m;
		le->m_tail = m_last(m);
	}
}
#ifdef TCPHPTS
static void
tcp_queue_pkts(struct tcpcb *tp, struct lro_entry *le)
{
	if (tp->t_in_pkt == NULL) {
		/* Nothing yet there */
		tp->t_in_pkt = le->m_head;
		tp->t_tail_pkt = le->m_last_mbuf;
	} else {
		/* Already some there */
		tp->t_tail_pkt->m_nextpkt = le->m_head;
		tp->t_tail_pkt = le->m_last_mbuf;
	}
	le->m_head = NULL;
	le->m_last_mbuf = NULL;
}

static struct mbuf *
tcp_lro_get_last_if_ackcmp(struct lro_ctrl *lc, struct lro_entry *le,
    struct inpcb *inp)
{
	struct mbuf *m = NULL;
	struct tcpcb *tp;

	tp = intotcpcb(inp);
	if (tp) {
		/* Look at the last mbuf if any in queue */
		if ((tp->t_tail_pkt) &&
		    (tp->t_tail_pkt->m_flags & M_ACKCMP)) {
			if (M_TRAILINGSPACE(tp->t_tail_pkt) >=
			    sizeof(struct tcp_ackent)) {
				tcp_lro_log(tp, lc, le, NULL, 23, 0, 0, 0, 0);
				m = tp->t_tail_pkt;
			} else {
				if ((inp->inp_flags2 & INP_MBUF_L_ACKS) == 0) {
					counter_u64_add(tcp_would_have_but, 1);
					inp->inp_flags2 |= INP_MBUF_L_ACKS;
				}
			}
		}
	}
	return (m);
}

static struct inpcb *
tcp_lro_lookup(struct lro_ctrl *lc, struct lro_entry *le)
{
	struct inpcb *inp = NULL;

	NET_EPOCH_ASSERT();
	switch (le->eh_type) {
#ifdef INET6
	case ETHERTYPE_IPV6:
		inp = in6_pcblookup(&V_tcbinfo, &le->source_ip6,
		    le->source_port, &le->dest_ip6, le->dest_port,
		    INPLOOKUP_WLOCKPCB,
		    lc->ifp);
		break;
#endif
#ifdef INET
	case ETHERTYPE_IP:
		inp = in_pcblookup(&V_tcbinfo, le->le_ip4->ip_src,
		    le->source_port, le->le_ip4->ip_dst, le->dest_port,
		    INPLOOKUP_WLOCKPCB,
		    lc->ifp);
		break;
#endif
	}
	return (inp);
}
#endif

#ifdef NO
static void
stack_guard_prep(uint32_t *sg, int len)
{
	int i;

	for (i = 0; i < len; i++) {
		sg[i] = 0xdeadc0de;
	}
}

static void
stack_guard_check(struct lro_ctrl *lc, struct lro_entry *le, uint32_t *sg,
    int len)
{
	int i;

	for (i = 0; i < len; i++) {
		if (sg[i] != 0xdeadc0de)
			panic("Stack guard fails sg[%d] = 0x%x le:%p lc:%p sg:%p\n",
			    i, sg[i], le, lc, sg);
	}
}
#endif
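
/*
 * tcp_lro_flush() below hands a completed entry to the stack. With
 * TCPHPTS compiled in, and when the connection's stack supports mbuf
 * queueing (INP_SUPPORTS_MBUFQ or INP_MBUF_ACKCMP), the whole chain is
 * appended to tp->t_in_pkt via tcp_queue_pkts() and the transport is
 * only woken through hpts when something urgent (FIN/SACK) or a
 * size/count limit demands it; otherwise the classic path condenses the
 * chain and sends the result through if_input().
 */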
void
tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le)
{
	struct tcpcb *tp = NULL;
#ifdef TCPHPTS
	struct inpcb *inp = NULL;
	int need_wakeup = 0, can_queue = 0;

	/* Now let's look up the inp first */
	CURVNET_SET(lc->ifp->if_vnet);
	/*
	 * XXXRRS Currently the common input handler for
	 * mbuf queuing cannot handle VLAN Tagged. This needs
	 * to be fixed and the or condition removed (i.e. the
	 * common code should do the right lookup for the vlan
	 * tag and anything else that the vlan_input() does).
	 */
	if (le->m_head == NULL) {
		/*
		 * Everything was pushed up to the stack; nothing to do
		 * but release the reference and be done.
		 */
		if (le->inp) {
			INP_WLOCK(le->inp);
			if (in_pcbrele_wlocked(le->inp) == 0) {
				/*
				 * We released it and still
				 * have the lock.
				 */
				INP_WUNLOCK(le->inp);
			}
		}
		goto done;
	}
	if ((tcplro_stacks_wanting_mbufq == 0) ||
	    (le->m_head->m_flags & M_VLANTAG))
		goto skip_lookup;

	if (le->inp == NULL) {
		le->inp = inp = tcp_lro_lookup(lc, le);
		if (inp && ((inp->inp_flags & (INP_DROPPED|INP_TIMEWAIT)) ||
		    (inp->inp_flags2 & INP_FREED))) {
			/*
			 * We can't present these to the inp since
			 * it will not support the stripped ethernet
			 * header that these have, nor if a compressed
			 * ack is present.
			 */
			INP_WUNLOCK(inp);
			lro_free_mbuf_chain(le->m_head);
			goto done;
		}
		if ((le->flags & HAS_COMP_ENTRIES) &&
		    ((inp->inp_flags2 & INP_MBUF_ACKCMP) == 0)) {
			/*
			 * It swapped to off, must be a stack
			 * switch. We need to ditch all the packets
			 * and the peer will just have to retransmit.
			 */
			INP_WUNLOCK(inp);
			lro_free_mbuf_chain(le->m_head);
			goto done;
		}
	} else {
		/* We have a reference on the inp; lock and release it */
		inp = le->inp;
		INP_WLOCK(inp);
		if (in_pcbrele_wlocked(inp)) {
			/*
			 * We lost the inp. We can't present these to the
			 * inp since it will not support the stripped-off
			 * ethernet header.
			 */
			lro_free_mbuf_chain(le->m_head);
			goto done;
		}
		if (inp && ((inp->inp_flags & (INP_DROPPED|INP_TIMEWAIT)) ||
		    (inp->inp_flags2 & INP_FREED))) {
			/*
			 * We can't present these to the inp since
			 * it may not support them.
			 */
			INP_WUNLOCK(inp);
			lro_free_mbuf_chain(le->m_head);
			goto done;
		}
		if ((le->flags & HAS_COMP_ENTRIES) &&
		    ((inp->inp_flags2 & INP_MBUF_ACKCMP) == 0)) {
			/*
			 * It swapped to off, must be a stack
			 * switch. We need to ditch all the packets
			 * and the peer will just have to retransmit.
			 */
			INP_WUNLOCK(inp);
			lro_free_mbuf_chain(le->m_head);
			goto done;
		}
	}
	if (inp && ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) ||
	    (inp->inp_flags2 & INP_MBUF_ACKCMP))) {
		/* The transport supports mbuf queuing */
		can_queue = 1;
		if (le->need_wakeup ||
		    ((inp->inp_in_input == 0) &&
		    ((inp->inp_flags2 & INP_MBUF_QUEUE_READY) == 0))) {
			/*
			 * Either the transport is off on a keep-alive
			 * (it has the queue_ready flag clear and has
			 * not already been woken) or the entry has
			 * some urgent thing (FIN or possibly SACK blocks).
			 * This means we need to wake the transport up by
			 * putting it on the input pacer.
			 */
			need_wakeup = 1;
			if ((inp->inp_flags2 & INP_DONT_SACK_QUEUE) &&
			    (le->need_wakeup != 1)) {
				/*
				 * Prohibited from a sack wakeup.
				 */
				need_wakeup = 0;
			}
		}
		/* Do we need to be awoken due to lots of data or acks? */
		if ((le->tcp_tot_p_len >= lc->lro_length_lim) ||
		    (le->mbuf_cnt >= lc->lro_ackcnt_lim))
			need_wakeup = 1;
	}
	if (inp)
		tp = intotcpcb(inp);
	else
		tp = NULL;
	if (can_queue) {
		counter_u64_add(tcp_inp_lro_direct_queue, 1);
		tcp_lro_log(tp, lc, le, NULL, 22, need_wakeup,
		    inp->inp_flags2, inp->inp_in_input, le->need_wakeup);
		tcp_queue_pkts(tp, le);
		if (need_wakeup) {
			/*
			 * We must get the guy to wakeup via
			 * hpts.
			 */
			NET_EPOCH_ASSERT();
			if (le->need_wakeup == 2) {
				/*
				 * The value 2 is set if the
				 * options are unrecognized, i.e.
				 * not just a timestamp. So really
				 * sack is usually what it is, but
				 * it might be some other option (CWR
				 * etc).
				 */
				counter_u64_add(tcp_inp_lro_sack_wake, 1);
			}
			counter_u64_add(tcp_inp_lro_wokeup_queue, 1);
			if ((*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0)) {
				inp = NULL;
			}
		}
	}
	if (inp) {
		/* Unlock it */
		tp = NULL;
		counter_u64_add(tcp_inp_lro_locks_taken, 1);
		INP_WUNLOCK(inp);
	}
	if (can_queue == 0) {
skip_lookup:
		if (le->strip_cnt) {
			/*
			 * We have stripped mbufs; the connection
			 * must have changed underneath us. You
			 * lose the packets as a penalty.
			 */
			lro_free_mbuf_chain(le->m_head);
			goto done;
		}
#endif /* TCPHPTS */
		/* Old fashioned lro method */
		if (le->m_head != le->m_last_mbuf) {
			counter_u64_add(tcp_inp_lro_compressed, 1);
			tcp_lro_condense(tp, lc, le);
		} else
			counter_u64_add(tcp_inp_lro_single_push, 1);
		tcp_flush_out_le(tp, lc, le);
#ifdef TCPHPTS
	}
done:
	CURVNET_RESTORE();
#endif
	lc->lro_flushed++;
	bzero(le, sizeof(*le));
	LIST_INSERT_HEAD(&lc->lro_free, le, next);
}

#ifdef HAVE_INLINE_FLSLL
#define	tcp_lro_msb_64(x) (1ULL << (flsll(x) - 1))
#else
static inline uint64_t
tcp_lro_msb_64(uint64_t x)
{
	x |= (x >> 1);
	x |= (x >> 2);
	x |= (x >> 4);
	x |= (x >> 8);
	x |= (x >> 16);
	x |= (x >> 32);
	return (x & ~(x >> 1));
}
#endif
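
/*
 * Example for the fallback tcp_lro_msb_64() above: x = 0x48 (binary
 * 01001000) is smeared to 01111111 by the shift-or steps, and
 * x & ~(x >> 1) then isolates the most significant set bit, 01000000
 * (0x40), matching 1ULL << (flsll(x) - 1).
 */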
/*
 * The tcp_lro_sort() routine is comparable to qsort(), except it has
 * a worst case complexity limit of O(MIN(N,64)*N), where N is the
 * number of elements to sort and 64 is the number of sequence bits
 * available. The algorithm is bit-slicing the 64-bit sequence number,
 * sorting one bit at a time from the most significant bit until the
 * least significant one, skipping the constant bits. This is
 * typically called a radix sort.
 */
static void
tcp_lro_sort(struct lro_mbuf_sort *parray, uint32_t size)
{
	struct lro_mbuf_sort temp;
	uint64_t ones;
	uint64_t zeros;
	uint32_t x;
	uint32_t y;

repeat:
	/* for small arrays insertion sort is faster */
	if (size <= 12) {
		for (x = 1; x < size; x++) {
			temp = parray[x];
			for (y = x; y > 0 && temp.seq < parray[y - 1].seq; y--)
				parray[y] = parray[y - 1];
			parray[y] = temp;
		}
		return;
	}

	/* compute sequence bits which are constant */
	ones = 0;
	zeros = 0;
	for (x = 0; x != size; x++) {
		ones |= parray[x].seq;
		zeros |= ~parray[x].seq;
	}

	/* compute bits which are not constant into "ones" */
	ones &= zeros;
	if (ones == 0)
		return;

	/* pick the most significant bit which is not constant */
	ones = tcp_lro_msb_64(ones);

	/*
	 * Move entries having cleared sequence bits to the beginning
	 * of the array:
	 */
	for (x = y = 0; y != size; y++) {
		/* skip set bits */
		if (parray[y].seq & ones)
			continue;
		/* swap entries */
		temp = parray[x];
		parray[x] = parray[y];
		parray[y] = temp;
		x++;
	}

	KASSERT(x != 0 && x != size, ("Memory is corrupted\n"));

	/* sort zeros */
	tcp_lro_sort(parray, x);

	/* sort ones */
	parray += x;
	size -= x;
	goto repeat;
}

void
tcp_lro_flush_all(struct lro_ctrl *lc)
{
	uint64_t seq;
	uint64_t nseq;
	unsigned x;

	/* check if no mbufs to flush */
	if (lc->lro_mbuf_count == 0)
		goto done;

	microuptime(&lc->lro_last_flush);
	/* sort all mbufs according to stream */
	tcp_lro_sort(lc->lro_mbuf_data, lc->lro_mbuf_count);

	/* input data into LRO engine, stream by stream */
	seq = 0;
	for (x = 0; x != lc->lro_mbuf_count; x++) {
		struct mbuf *mb;

		/* get mbuf */
		mb = lc->lro_mbuf_data[x].mb;

		/* get sequence number, masking away the packet index */
		nseq = lc->lro_mbuf_data[x].seq & (-1ULL << 24);

		/* check for new stream */
		if (seq != nseq) {
			seq = nseq;

			/* flush active streams */
			tcp_lro_rx_done(lc);
		}

		/* add packet to LRO engine */
		if (tcp_lro_rx2(lc, mb, 0, 0) != 0) {
			/* input packet to network layer */
			(*lc->ifp->if_input)(lc->ifp, mb);
			lc->lro_queued++;
			lc->lro_flushed++;
		}
	}
done:
	/* flush active streams */
	tcp_lro_rx_done(lc);

	lc->lro_mbuf_count = 0;
}

static void
lro_set_mtime(struct timeval *tv, struct timespec *ts)
{
	tv->tv_sec = ts->tv_sec;
	tv->tv_usec = ts->tv_nsec / 1000;
}
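
/*
 * The 64-bit sort key built in tcp_lro_queue_mbuf() is laid out as
 * (hash_type << 56) | (flowid << 24) | array_index. Masking with
 * (-1ULL << 24) in tcp_lro_flush_all() drops the index, so equal masked
 * values mean "same stream", while the low 24 index bits keep the sort
 * stable and thus preserve arrival order within each stream.
 */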
#ifdef TCPHPTS
static void
build_ack_entry(struct tcp_ackent *ae, struct tcphdr *th, struct mbuf *m,
    uint16_t hdr_len, uint16_t iptos)
{
	/*
	 * Given a TCP ACK, summarize it down into the small TCP
	 * ack entry.
	 */
	u_char *cp;

	KASSERT(((th->th_flags & ~(TH_ACK | TH_PUSH | TH_CWR | TH_ECE)) == 0),
	    ("tcphdr:%p mbuf:%p has unallowed bits %x", th, m, th->th_flags));
	ae->timestamp = m->m_pkthdr.rcv_tstmp;
	if (m->m_flags & M_TSTMP_LRO)
		ae->flags = TSTMP_LRO;
	else if (m->m_flags & M_TSTMP)
		ae->flags = TSTMP_HDWR;
	ae->seq = ntohl(th->th_seq);
	ae->ack = ntohl(th->th_ack);
	ae->flags |= th->th_flags;
	if (hdr_len) {
		/* We have timestamp options; get out the bits */
		cp = (u_char *)(th + 1);
		/* Skip the two NOP's at the front */
		while (*cp == TCPOPT_NOP)
			cp++;
		KASSERT(((*cp == TCPOPT_TIMESTAMP) &&
		    (cp[1] == TCPOLEN_TIMESTAMP)),
		    ("At %p in tcphdr:%p options of %d not timestamp",
		    cp, th, hdr_len));
		bcopy((char *)cp + 2,
		    (char *)&ae->ts_value, sizeof(uint32_t));
		ae->ts_value = ntohl(ae->ts_value);
		bcopy((char *)cp + 6,
		    (char *)&ae->ts_echo, sizeof(uint32_t));
		ae->ts_echo = ntohl(ae->ts_echo);
		ae->flags |= HAS_TSTMP;
	}
	ae->win = ntohs(th->th_win);
	ae->codepoint = iptos;
}
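
/*
 * A struct tcp_ackent is the compressed form of one pure ACK: arrival
 * timestamp, sequence and ack numbers in host order, window, TCP flags,
 * the IP TOS/traffic-class codepoint, and (when present) the timestamp
 * option pair. Many such entries are packed back to back into a single
 * M_ACKCMP mbuf that is handed to the transport in one shot.
 */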
static struct mbuf *
do_bpf_and_csum(struct inpcb *inp, struct lro_ctrl *lc, struct lro_entry *le,
    struct ether_header *eh, struct mbuf *m, int bpf_req, int locked)
{
	/*
	 * Do TCP/IP checksum and BPF tap for either ACK_CMP packets or
	 * MBUF QUEUE type packets.
	 */
	struct tcphdr *th;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;	/* Keep compiler happy. */
#endif
#ifdef INET
	struct ip *ip = NULL;		/* Keep compiler happy. */
#endif
	uint16_t drop_hdrlen;
	int etype, tlen;
	uint8_t iptos;

	/* Let the BPF see the packet */
	if (bpf_req && lc->ifp)
		ETHER_BPF_MTAP(lc->ifp, m);
	/* Get type and trim off the ethernet header */
	m->m_pkthdr.lro_etype = etype = ntohs(eh->ether_type);
	m_adj(m, sizeof(*eh));
	m->m_flags |= M_LRO_EHDRSTRP;
	switch (etype) {
#ifdef INET6
	case ETHERTYPE_IPV6:
	{
		if (m->m_len < (sizeof(*ip6) + sizeof(*th))) {
			m = m_pullup(m, sizeof(*ip6) + sizeof(*th));
			if (m == NULL) {
				TCPSTAT_INC(tcps_rcvshort);
				return (NULL);
			}
		}
		ip6 = (struct ip6_hdr *)(eh + 1);
		th = (struct tcphdr *)(ip6 + 1);
		tlen = ntohs(ip6->ip6_plen);
		drop_hdrlen = sizeof(*ip6);
		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) {
			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
				counter_u64_add(tcp_csum_hardware_w_ph, 1);
				th->th_sum = m->m_pkthdr.csum_data;
			} else {
				counter_u64_add(tcp_csum_hardware, 1);
				th->th_sum = in6_cksum_pseudo(ip6, tlen,
				    IPPROTO_TCP, m->m_pkthdr.csum_data);
			}
			th->th_sum ^= 0xffff;
		} else {
			counter_u64_add(tcp_csum_software, 1);
			th->th_sum = in6_cksum(m, IPPROTO_TCP, drop_hdrlen,
			    tlen);
		}
		if (th->th_sum) {
			TCPSTAT_INC(tcps_rcvbadsum);
			if (locked) {
				/* Log the bad news */
				struct tcpcb *tp = intotcpcb(inp);

				tcp_lro_log(tp, lc, le, m, 13, tlen,
				    m->m_pkthdr.csum_flags, drop_hdrlen,
				    th->th_sum);
			}
			m_freem(m);
			return (NULL);
		}

		/*
		 * Be proactive about unspecified IPv6 address in source.
		 * As we use all-zero to indicate unbounded/unconnected pcb,
		 * unspecified IPv6 address can be used to confuse us.
		 *
		 * Note that packets with unspecified IPv6 destination are
		 * already dropped in ip6_input.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
			/* XXX stat */
			m_freem(m);
			return (NULL);
		}
		break;
	}
#endif
#ifdef INET
	case ETHERTYPE_IP:
	{
		if (m->m_len < sizeof(struct tcpiphdr)) {
			if ((m = m_pullup(m, sizeof(struct tcpiphdr)))
			    == NULL) {
				TCPSTAT_INC(tcps_rcvshort);
				return (NULL);
			}
		}
		ip = (struct ip *)(eh + 1);
		th = (struct tcphdr *)(ip + 1);
		iptos = ip->ip_tos;
		drop_hdrlen = sizeof(*ip);
		tlen = ntohs(ip->ip_len) - sizeof(struct ip);
		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
				counter_u64_add(tcp_csum_hardware_w_ph, 1);
				th->th_sum = m->m_pkthdr.csum_data;
			} else {
				counter_u64_add(tcp_csum_hardware, 1);
				th->th_sum = in_pseudo(ip->ip_src.s_addr,
				    ip->ip_dst.s_addr,
				    htonl(m->m_pkthdr.csum_data + tlen +
				    IPPROTO_TCP));
			}
			th->th_sum ^= 0xffff;
		} else {
			int len;
			struct ipovly *ipov = (struct ipovly *)ip;

			/*
			 * Checksum extended TCP header and data.
			 */
			counter_u64_add(tcp_csum_software, 1);
			len = drop_hdrlen + tlen;
			bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
			ipov->ih_len = htons(tlen);
			th->th_sum = in_cksum(m, len);
			/* Reset length for SDT probes. */
			ip->ip_len = htons(len);
			/* Reset TOS bits */
			ip->ip_tos = iptos;
			/* Re-initialization for later version check */
			ip->ip_v = IPVERSION;
			ip->ip_hl = sizeof(*ip) >> 2;
		}
		if (th->th_sum) {
			TCPSTAT_INC(tcps_rcvbadsum);
			if (locked) {
				/* Log the bad news */
				struct tcpcb *tp = intotcpcb(inp);

				tcp_lro_log(tp, lc, le, m, 13, tlen,
				    m->m_pkthdr.csum_flags, drop_hdrlen,
				    th->th_sum);
			}
			m_freem(m);
			return (NULL);
		}
		break;
	}
#endif
	} /* end switch */
	return (m);
}
#endif
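
/*
 * tcp_lro_rx2() is the core receive path: validate the Ethernet/IP/TCP
 * headers, find (or create) the matching lro_entry in the hash bucket,
 * and chain the mbuf onto it; with TCPHPTS, pure ACKs for capable
 * connections are folded into a compressed M_ACKCMP mbuf instead. A
 * non-zero return tells the caller to input the packet unmodified.
 */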
static int
tcp_lro_rx2(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, int use_hash)
{
	struct lro_entry *le;
	struct ether_header *eh;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;	/* Keep compiler happy. */
#endif
#ifdef INET
	struct ip *ip4 = NULL;		/* Keep compiler happy. */
#endif
	struct tcphdr *th;
	void *l3hdr = NULL;		/* Keep compiler happy. */
	uint32_t *ts_ptr;
	tcp_seq seq;
	int error, ip_len, hdr_len, locked = 0;
	uint16_t eh_type, tcp_data_len, need_flush;
#ifdef TCPHPTS
	uint16_t iptos;
#endif
	struct lro_head *bucket;
	struct timespec arrv;

	/* Clear the flags we may use to communicate with TCP */
	m->m_flags &= ~(M_ACKCMP|M_LRO_EHDRSTRP);

	/* We expect a contiguous header [eh, ip, tcp]. */
	if ((m->m_flags & (M_TSTMP_LRO|M_TSTMP)) == 0) {
		/* If no hardware or arrival stamp on the packet add arrival */
		nanouptime(&arrv);
		m->m_pkthdr.rcv_tstmp = (arrv.tv_sec * 1000000000) +
		    arrv.tv_nsec;
		m->m_flags |= M_TSTMP_LRO;
	}
	eh = mtod(m, struct ether_header *);
	eh_type = ntohs(eh->ether_type);
	switch (eh_type) {
#ifdef INET6
	case ETHERTYPE_IPV6:
	{
		CURVNET_SET(lc->ifp->if_vnet);
		if (V_ip6_forwarding != 0) {
			/* XXX-BZ stats but changing lro_ctrl is a problem. */
			CURVNET_RESTORE();
			return (TCP_LRO_CANNOT);
		}
		CURVNET_RESTORE();
		l3hdr = ip6 = (struct ip6_hdr *)(eh + 1);
		error = tcp_lro_rx_ipv6(lc, m, ip6, &th);
		if (error != 0)
			return (error);
		tcp_data_len = ntohs(ip6->ip6_plen);
#ifdef TCPHPTS
		iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
#endif
		ip_len = sizeof(*ip6) + tcp_data_len;
		break;
	}
#endif
#ifdef INET
	case ETHERTYPE_IP:
	{
		CURVNET_SET(lc->ifp->if_vnet);
		if (V_ipforwarding != 0) {
			/* XXX-BZ stats but changing lro_ctrl is a problem. */
			CURVNET_RESTORE();
			return (TCP_LRO_CANNOT);
		}
		CURVNET_RESTORE();
		l3hdr = ip4 = (struct ip *)(eh + 1);
		error = tcp_lro_rx_ipv4(lc, m, ip4, &th);
		if (error != 0)
			return (error);
		ip_len = ntohs(ip4->ip_len);
#ifdef TCPHPTS
		iptos = ip4->ip_tos;
#endif
		tcp_data_len = ip_len - sizeof(*ip4);
		break;
	}
#endif
	/* XXX-BZ what happens in case of VLAN(s)? */
	default:
		return (TCP_LRO_NOT_SUPPORTED);
	}

	/*
	 * If the frame is padded beyond the end of the IP packet, then we must
	 * trim the extra bytes off.
	 */
	hdr_len = m->m_pkthdr.len - (ETHER_HDR_LEN + ip_len);
	if (hdr_len != 0) {
		if (hdr_len < 0)
			/* Truncated packet. */
			return (TCP_LRO_CANNOT);

		m_adj(m, -hdr_len);
	}
	/*
	 * Check TCP header constraints.
	 */
	hdr_len = (th->th_off << 2);
	ts_ptr = (uint32_t *)(th + 1);
	tcp_data_len -= hdr_len;
	hdr_len -= sizeof(*th);
	if (th->th_flags & TH_SYN)
		return (TCP_LRO_CANNOT);
	if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) {
		need_flush = 1;
	} else
		need_flush = 0;
	if (hdr_len != 0 && (__predict_false(hdr_len != TCPOLEN_TSTAMP_APPA) ||
	    (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
	    TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) {
		/*
		 * We have an option besides timestamps; maybe
		 * it is a sack (most likely), which means we
		 * will probably need to wake up a sleeper (if
		 * the guy does queueing).
		 */
		need_flush = 2;
	}
	/* If the driver did not pass in the checksum, set it now. */
	if (csum == 0x0000)
		csum = th->th_sum;
	seq = ntohl(th->th_seq);
	if (!use_hash) {
		bucket = &lc->lro_hash[0];
	} else if (M_HASHTYPE_ISHASH(m)) {
		bucket = &lc->lro_hash[m->m_pkthdr.flowid % lc->lro_hashsz];
	} else {
		uint32_t hash;

		switch (eh_type) {
#ifdef INET
		case ETHERTYPE_IP:
			hash = ip4->ip_src.s_addr + ip4->ip_dst.s_addr;
			break;
#endif
#ifdef INET6
		case ETHERTYPE_IPV6:
			hash = ip6->ip6_src.s6_addr32[0] +
			    ip6->ip6_dst.s6_addr32[0];
			hash += ip6->ip6_src.s6_addr32[1] +
			    ip6->ip6_dst.s6_addr32[1];
			hash += ip6->ip6_src.s6_addr32[2] +
			    ip6->ip6_dst.s6_addr32[2];
			hash += ip6->ip6_src.s6_addr32[3] +
			    ip6->ip6_dst.s6_addr32[3];
			break;
#endif
		default:
			hash = 0;
			break;
		}
		hash += th->th_sport + th->th_dport;
		bucket = &lc->lro_hash[hash % lc->lro_hashsz];
	}
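
	/*
	 * Bucket selection above: tcp_lro_flush_all() calls us with
	 * use_hash == 0 because it has already sorted the packets by
	 * stream, so a single bucket suffices. Otherwise we prefer the
	 * NIC-supplied flowid and only fall back to the simple additive
	 * hash over addresses and ports computed above, reduced modulo
	 * lc->lro_hashsz.
	 */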
	/* Try to find a matching previous segment. */
	LIST_FOREACH(le, bucket, hash_next) {
		if (le->eh_type != eh_type)
			continue;
		if (le->source_port != th->th_sport ||
		    le->dest_port != th->th_dport)
			continue;
		switch (eh_type) {
#ifdef INET6
		case ETHERTYPE_IPV6:
			if (bcmp(&le->source_ip6, &ip6->ip6_src,
			    sizeof(struct in6_addr)) != 0 ||
			    bcmp(&le->dest_ip6, &ip6->ip6_dst,
			    sizeof(struct in6_addr)) != 0)
				continue;
			break;
#endif
#ifdef INET
		case ETHERTYPE_IP:
			if (le->source_ip4 != ip4->ip_src.s_addr ||
			    le->dest_ip4 != ip4->ip_dst.s_addr)
				continue;
			break;
#endif
		}
		if (tcp_data_len || SEQ_GT(ntohl(th->th_ack),
		    ntohl(le->ack_seq)) ||
		    (th->th_ack == le->ack_seq)) {
			m->m_pkthdr.lro_len = tcp_data_len;
		} else {
			/* no data and old ack */
			m_freem(m);
			return (0);
		}
#ifdef TCPHPTS
		if ((tcplro_stacks_wanting_mbufq == 0) ||
		    (m->m_flags & M_VLANTAG))
			goto skip_lookup_a;
		if (le->inp == NULL) {
			CURVNET_SET(lc->ifp->if_vnet);
			le->inp = tcp_lro_lookup(lc, le);
			if (le->inp) {
				in_pcbref(le->inp);
				locked = 1;
			}
			CURVNET_RESTORE();
		} else if (le->inp) {
			INP_WLOCK(le->inp);
			locked = 1;
		}
		if (locked && ((le->inp->inp_flags &
		    (INP_DROPPED|INP_TIMEWAIT)) ||
		    (le->inp->inp_flags2 & INP_FREED))) {
			/*
			 * We can't present these to the inp since
			 * it's dead, Jim.
			 */
			int ret;

			ret = in_pcbrele_wlocked(le->inp);
			if (ret == 0)
				INP_WUNLOCK(le->inp);
			le->inp = NULL;
			locked = 0;
			tcp_lro_active_remove(le);
			if (le->strip_cnt && le->m_head) {
				/*
				 * If we have any stripped packets we
				 * just dump the whole chain. The
				 * tcp_lro_flush code knows how
				 * to handle things when le->m_head is NULL
				 * and even le->inp is NULL.
				 */
				lro_free_mbuf_chain(le->m_head);
				le->m_head = NULL;
			}
			tcp_lro_flush(lc, le);
			return (TCP_LRO_CANNOT);
		}
		/* See if it has been switched on */
		if (le->inp && (le->inp->inp_flags2 & INP_MBUF_ACKCMP))
			le->flags |= CAN_USE_ACKCMP;

		if ((need_flush == 1) &&
		    le->inp &&
		    (le->inp->inp_flags2 &
		    (INP_MBUF_ACKCMP|INP_SUPPORTS_MBUFQ)) &&
		    ((th->th_flags & ~(TH_ACK | TH_PUSH | TH_ECE | TH_CWR)) == 0)) {
			/*
			 * For MBUF queuing or ACKCMP we can accept ECE and CWR
			 * since each packet is sent to the transport (or the
			 * compressed state including the ECN bits).
			 */
			need_flush = 0;
		}
skip_lookup_a:
#endif
		if (need_flush)
			le->need_wakeup = need_flush;
		/* Save off the data-only csum */
		m->m_pkthdr.rcvif = lc->ifp;
		m->m_pkthdr.lro_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th,
		    tcp_data_len, ~csum);
		th->th_sum = csum;	/* Restore checksum */
#ifdef TCPHPTS
		if ((le->flags & CAN_USE_ACKCMP) ||
		    (le->inp &&
		    (le->inp->inp_flags2 &
		    (INP_MBUF_ACKCMP|INP_SUPPORTS_MBUFQ)))) {
			/*
			 * Mbuf queued and ACKCMP packets have their BPF and
			 * csum done here in LRO. They will still end up
			 * looking at the headers and such (IP/TCP) but we
			 * don't want to proceed with any bad csum!
			 */
			m = do_bpf_and_csum(le->inp, lc, le, eh, m,
			    bpf_peers_present(lc->ifp->if_bpf), locked);
			if (m == NULL) {
				/* Bad csum, accounting already done */
				if (locked) {
					INP_WUNLOCK(le->inp);
				}
				return (0);
			}
			le->strip_cnt++;
		}
		if ((need_flush == 0) &&
		    (th->th_flags & TH_ACK) &&
		    (tcp_data_len == 0) &&
		    (le->flags & CAN_USE_ACKCMP)) {
			/*
			 * OK, this is a pure ack; let's find out if our
			 * last packet already has one of these.
			 */
			struct mbuf *nm;
			struct tcp_ackent *ack_ent;
			int idx;

			INP_WLOCK_ASSERT(le->inp);
			if (le->m_head == NULL) {
				/* Ok can we still use the end of the inp's? */
				nm = tcp_lro_get_last_if_ackcmp(lc, le,
				    le->inp);
				if (nm == NULL) {
					/* gone or full */
					goto new_one;
				}
				/* We can add in to the one on the tail */
				ack_ent = mtod(nm, struct tcp_ackent *);
				idx = (nm->m_len / sizeof(struct tcp_ackent));
				build_ack_entry(&ack_ent[idx], th, m,
				    hdr_len, iptos);
				/* Bump the size of both pkt-hdr and len */
				nm->m_len += sizeof(struct tcp_ackent);
				nm->m_pkthdr.len += sizeof(struct tcp_ackent);
				le->ack_seq = th->th_ack;
				le->window = th->th_win;
				m_freem(m);
				counter_u64_add(tcp_extra_mbuf, 1);
				INP_WUNLOCK(le->inp);
				return (0);
			} else if (le->m_last_mbuf->m_flags & M_ACKCMP) {
				/* Yes we might be able to be appended to */
				nm = le->m_last_mbuf;
				if (M_TRAILINGSPACE(nm) <
				    sizeof(struct tcp_ackent)) {
					if ((le->inp->inp_flags2 &
					    INP_MBUF_L_ACKS) == 0) {
						counter_u64_add(tcp_would_have_but, 1);
						le->inp->inp_flags2 |= INP_MBUF_L_ACKS;
					}
					goto new_one;
				}
				/* we have room */
				ack_ent = mtod(nm, struct tcp_ackent *);
				idx = (nm->m_len / sizeof(struct tcp_ackent));
				build_ack_entry(&ack_ent[idx], th, m,
				    hdr_len, iptos);
				/* Bump the size of both pkt-hdr and len */
				nm->m_len += sizeof(struct tcp_ackent);
				nm->m_pkthdr.len += sizeof(struct tcp_ackent);
				m_freem(m);
				le->flags |= HAS_COMP_ENTRIES;
				le->cmp_ack_cnt++;
				goto compressed;
			} else {
				/* Nope we need a new one */
new_one:
				if (le->inp->inp_flags2 & INP_MBUF_L_ACKS)
					nm = m_getcl(M_NOWAIT, MT_DATA,
					    (M_ACKCMP|M_PKTHDR));
				else {
					nm = m_gethdr(M_NOWAIT, MT_DATA);
					if (nm != NULL)
						nm->m_flags |= M_ACKCMP;
				}
				if (nm) {
					nm->m_pkthdr.rcvif = lc->ifp;
					ack_ent = mtod(nm,
					    struct tcp_ackent *);
					build_ack_entry(ack_ent, th, m,
					    hdr_len, iptos);
					m_freem(m);
					m = nm;
					m->m_pkthdr.len = m->m_len =
					    sizeof(struct tcp_ackent);
					le->flags |= HAS_COMP_ENTRIES;
					le->cmp_ack_cnt++;
				}
				/* We fall through and append */
			}
		}
		if (m->m_flags & M_ACKCMP) {
			counter_u64_add(tcp_comp_total, 1);
		} else {
			counter_u64_add(tcp_uncomp_total, 1);
		}
#endif
		/* Save off the tail I am appending to (prev) */
		m->m_nextpkt = NULL;
		if (le->m_head == NULL) {
			/*
			 * Case where we were chaining off the inp
			 * and now no longer can.
			 */
			le->m_head = m;
			le->m_tail = m_last(m);
			le->m_last_mbuf = m;
			le->m_prev_last = NULL;
		} else {
			le->m_prev_last = le->m_last_mbuf;
			/* Mark me in the last spot */
			le->m_last_mbuf->m_nextpkt = m;
			/* Now set the tail to me */
			le->m_last_mbuf = m;
			le->tcp_tot_p_len += tcp_data_len;
		}
#ifdef TCPHPTS
compressed:
#endif
		le->mbuf_cnt++;
		/* Add to the total size of data */
		lro_set_mtime(&le->mtime, &arrv);
		if (locked)
			INP_WUNLOCK(le->inp);
		return (0);
	}
	/* Try to find an empty slot. */
	if (LIST_EMPTY(&lc->lro_free))
		return (TCP_LRO_NO_ENTRIES);

	/* Start a new segment chain. */
	le = LIST_FIRST(&lc->lro_free);
	LIST_REMOVE(le, next);
	tcp_lro_active_insert(lc, bucket, le);
	lro_set_mtime(&le->mtime, &arrv);

	/* Start filling in details. */
	switch (eh_type) {
#ifdef INET6
	case ETHERTYPE_IPV6:
		le->le_ip6 = ip6;
		le->source_ip6 = ip6->ip6_src;
		le->dest_ip6 = ip6->ip6_dst;
		le->eh_type = eh_type;
		le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN - sizeof(*ip6);
		break;
#endif
#ifdef INET
	case ETHERTYPE_IP:
		le->le_ip4 = ip4;
		le->source_ip4 = ip4->ip_src.s_addr;
		le->dest_ip4 = ip4->ip_dst.s_addr;
		le->eh_type = eh_type;
		le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN;
		break;
#endif
	}
	le->source_port = th->th_sport;
	le->dest_port = th->th_dport;
	le->next_seq = seq + tcp_data_len;
	le->ack_seq = th->th_ack;
	le->window = th->th_win;
	if (hdr_len != 0) {
		le->timestamp = 1;
		le->tsval = ntohl(*(ts_ptr + 1));
		le->tsecr = *(ts_ptr + 2);
	}
	KASSERT(le->ulp_csum == 0, ("%s: le=%p le->ulp_csum=0x%04x\n",
	    __func__, le, le->ulp_csum));

	le->append_cnt = 0;
	le->ulp_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th, tcp_data_len,
	    ~csum);
	le->append_cnt++;
	th->th_sum = csum;	/* Restore checksum */
	m->m_pkthdr.rcvif = lc->ifp;
	m->m_pkthdr.lro_len = tcp_data_len;
	le->mbuf_cnt = 1;
	le->cmp_ack_cnt = 0;
	le->flags = 0;
#ifdef TCPHPTS
	/*
	 * Let's find out if we can use the mbuf-compression.
	 */
	if ((tcplro_stacks_wanting_mbufq == 0) || (m->m_flags & M_VLANTAG))
		goto skip_lookup_b;
	CURVNET_SET(lc->ifp->if_vnet);
	le->inp = tcp_lro_lookup(lc, le);
	if (le->inp && ((le->inp->inp_flags & (INP_DROPPED|INP_TIMEWAIT)) ||
	    (le->inp->inp_flags2 & INP_FREED))) {
		INP_WUNLOCK(le->inp);
		le->inp = NULL;
	}
	if (le->inp) {
		if ((need_flush == 1) &&
		    (le->inp->inp_flags2 &
		    (INP_MBUF_ACKCMP|INP_SUPPORTS_MBUFQ)) &&
		    ((th->th_flags & ~(TH_ACK | TH_PUSH | TH_ECE | TH_CWR)) == 0)) {
			/*
			 * For MBUF queuing or ACKCMP we can accept ECE and CWR
			 * since each packet is sent to the transport (or the
			 * compressed state including the ECN bits).
			 */
			need_flush = 0;
		}
		locked = 1;
		if (le->inp->inp_flags2 & INP_MBUF_ACKCMP)
			le->flags |= CAN_USE_ACKCMP;
		if ((le->flags & CAN_USE_ACKCMP) ||
		    (le->inp &&
		    (le->inp->inp_flags2 &
		    (INP_MBUF_ACKCMP|INP_SUPPORTS_MBUFQ)))) {
			m = do_bpf_and_csum(le->inp, lc, le, eh, m,
			    bpf_peers_present(lc->ifp->if_bpf), locked);
			if (m == NULL) {
				/* Bad csum, accounting already done */
				INP_WUNLOCK(le->inp);
				le->inp = NULL;
				return (0);
			}
			le->strip_cnt++;
		}
		in_pcbref(le->inp);
	}
	CURVNET_RESTORE();
	if ((need_flush == 0) &&
	    (th->th_flags & TH_ACK) &&
	    (tcp_data_len == 0) &&
	    (le->flags & CAN_USE_ACKCMP)) {
		/* OK, this is a pure ack; let's build our special COMPRESS mbuf */
		struct mbuf *nm;
		struct tcp_ackent *ack_ent;

		/*
		 * Question: what is going on with the last mbuf on the
		 * inp queue, can we use it?
		 */
		INP_WLOCK_ASSERT(le->inp);
		nm = tcp_lro_get_last_if_ackcmp(lc, le, le->inp);
		if (nm) {
			int idx;

			/* We can add in to the one on the tail */
			ack_ent = mtod(nm, struct tcp_ackent *);
			idx = (nm->m_len / sizeof(struct tcp_ackent));
			build_ack_entry(&ack_ent[idx], th, m, hdr_len, iptos);
			nm->m_len += sizeof(struct tcp_ackent);
			nm->m_pkthdr.len += sizeof(struct tcp_ackent);
			le->ack_seq = th->th_ack;
			le->window = th->th_win;
			m_freem(m);
			counter_u64_add(tcp_extra_mbuf, 1);
			le->m_head = NULL;
			le->m_tail = NULL;
			le->m_last_mbuf = NULL;
			le->m_prev_last = NULL;
			INP_WUNLOCK(le->inp);
			return (0);
		} else {
			if (le->inp->inp_flags2 & INP_MBUF_L_ACKS)
				nm = m_getcl(M_NOWAIT, MT_DATA,
				    (M_ACKCMP|M_PKTHDR));
			else {
				nm = m_gethdr(M_NOWAIT, MT_DATA);
				if (nm != NULL)
					nm->m_flags |= M_ACKCMP;
			}
			if (nm) {
				nm->m_pkthdr.rcvif = lc->ifp;
				ack_ent = mtod(nm, struct tcp_ackent *);
				build_ack_entry(ack_ent, th, m, hdr_len,
				    iptos);
				m_freem(m);
				m = nm;
				m->m_pkthdr.len = m->m_len =
				    sizeof(struct tcp_ackent);
				le->flags |= HAS_COMP_ENTRIES;
				le->cmp_ack_cnt++;
			}
		}
	}
	if (m->m_flags & M_ACKCMP) {
		counter_u64_add(tcp_comp_total, 1);
	} else {
		counter_u64_add(tcp_uncomp_total, 1);
	}
skip_lookup_b:
#endif
	if (need_flush)
		le->need_wakeup = need_flush;
	else
		le->need_wakeup = 0;
	m->m_nextpkt = NULL;
	le->m_head = m;
	le->m_tail = m_last(m);
	le->m_last_mbuf = m;
	le->m_prev_last = NULL;
	/*
	 * We keep the total size here for cross checking when we may need
	 * to flush/wakeup in the MBUF_QUEUE case.
	 */
	le->tcp_tot_p_len = tcp_data_len;
	if (locked)
		INP_WUNLOCK(le->inp);
	return (0);
}

int
tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
{

	return (tcp_lro_rx2(lc, m, csum, 1));
}

void
tcp_lro_queue_mbuf(struct lro_ctrl *lc, struct mbuf *mb)
{
	struct timespec arrv;

	/* sanity checks */
	if (__predict_false(lc->ifp == NULL || lc->lro_mbuf_data == NULL ||
	    lc->lro_mbuf_max == 0)) {
		/* packet drop */
		m_freem(mb);
		return;
	}

	/* check if packet is not LRO capable */
	if (__predict_false(mb->m_pkthdr.csum_flags == 0 ||
	    (lc->ifp->if_capenable & IFCAP_LRO) == 0)) {
		/* input packet to network layer */
		(*lc->ifp->if_input)(lc->ifp, mb);
		return;
	}
	/* Arrival stamp the packet */
	if ((mb->m_flags & M_TSTMP) == 0) {
		/* If no hardware or arrival stamp on the packet add arrival */
		nanouptime(&arrv);
		mb->m_pkthdr.rcv_tstmp = ((arrv.tv_sec * 1000000000) +
		    arrv.tv_nsec);
		mb->m_flags |= M_TSTMP_LRO;
	}
	/* create sequence number */
	lc->lro_mbuf_data[lc->lro_mbuf_count].seq =
	    (((uint64_t)M_HASHTYPE_GET(mb)) << 56) |
	    (((uint64_t)mb->m_pkthdr.flowid) << 24) |
	    ((uint64_t)lc->lro_mbuf_count);

	/* enter mbuf */
	lc->lro_mbuf_data[lc->lro_mbuf_count].mb = mb;

	/* flush if array is full */
	if (__predict_false(++lc->lro_mbuf_count == lc->lro_mbuf_max))
		tcp_lro_flush_all(lc);
}

/* end */
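
/*
 * Typical driver usage, as a hedged sketch (the "sc" softc layout and
 * error handling are hypothetical, not part of this file):
 *
 *	if (tcp_lro_init_args(&sc->lro, ifp, TCP_LRO_ENTRIES, 1024) != 0)
 *		return (ENOMEM);		// at attach time
 *	...
 *	tcp_lro_queue_mbuf(&sc->lro, m);	// per received packet,
 *						// or tcp_lro_rx(&sc->lro, m, 0)
 *	...
 *	tcp_lro_flush_all(&sc->lro);		// at the end of the RX batch
 *	...
 *	tcp_lro_free(&sc->lro);			// at detach time
 */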