/*-
 * Copyright (c) 2016-2018 Netflix, Inc.
 * Copyright (c) 2016-2021 Mellanox Technologies.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */
#include <sys/cdefs.h>
#include "opt_inet.h"
#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/ethernet.h>
#include <net/bpf.h>
#include <net/vnet.h>
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/infiniband.h>
#include <net/if_lagg.h>

#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/in_pcb.h>
#include <netinet6/in6_pcb.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_log_buf.h>

static void
build_ack_entry(struct tcp_ackent *ae, struct tcphdr *th, struct mbuf *m,
    uint32_t *ts_ptr, uint16_t iptos)
{
	/*
	 * Given a TCP ACK, summarize it down into the small TCP ACK
	 * entry.
	 */
	ae->timestamp = m->m_pkthdr.rcv_tstmp;
	ae->flags = 0;
	if (m->m_flags & M_TSTMP_LRO)
		ae->flags |= TSTMP_LRO;
	else if (m->m_flags & M_TSTMP)
		ae->flags |= TSTMP_HDWR;
	ae->seq = ntohl(th->th_seq);
	ae->ack = ntohl(th->th_ack);
	ae->flags |= tcp_get_flags(th);
	if (ts_ptr != NULL) {
		ae->ts_value = ntohl(ts_ptr[1]);
		ae->ts_echo = ntohl(ts_ptr[2]);
		ae->flags |= HAS_TSTMP;
	}
	ae->win = ntohs(th->th_win);
	ae->codepoint = iptos;
}

static inline bool
tcp_lro_ack_valid(struct mbuf *m, struct tcphdr *th, uint32_t **ppts, bool *other_opts)
{
	/*
	 * This function returns two bits of valuable information:
	 * a) Whether what is present can be ACK-compressed.  We can
	 *    ack-compress only if there are no options, or just a
	 *    timestamp option, and of course the th_flags must be
	 *    correct as well.
	 * b) Whether other options are present, such as SACK.  This is
	 *    used to determine if we want to wake up or not.
	 */
	bool ret = true;

	switch (th->th_off << 2) {
	case (sizeof(*th) + TCPOLEN_TSTAMP_APPA):
		*ppts = (uint32_t *)(th + 1);
		/* Check if we have only one timestamp option. */
		if (**ppts == TCP_LRO_TS_OPTION)
			*other_opts = false;
		else {
			*other_opts = true;
			ret = false;
		}
		break;
	case (sizeof(*th)):
		/* No options. */
		*ppts = NULL;
		*other_opts = false;
		break;
	default:
		*ppts = NULL;
		*other_opts = true;
		ret = false;
		break;
	}
	/* For ACKCMP we only accept ACK, PUSH, ECE and CWR. */
	if ((tcp_get_flags(th) & ~(TH_ACK | TH_PUSH | TH_ECE | TH_CWR)) != 0)
		ret = false;
	/* If it has data on it we cannot compress it. */
	if (m->m_pkthdr.lro_tcp_d_len)
		ret = false;

	/* ACK flag must be set. */
	if (!(tcp_get_flags(th) & TH_ACK))
		ret = false;
	return (ret);
}

/*
 * Ask the stack's early wake check, if it provides one, whether the
 * connection should be woken up now.
 */
static bool
tcp_lro_check_wake_status(struct tcpcb *tp)
{

	if (tp->t_fb->tfb_early_wake_check != NULL)
		return ((tp->t_fb->tfb_early_wake_check)(tp));
	return (false);
}

static void
tcp_lro_log(struct tcpcb *tp, const struct lro_ctrl *lc,
    const struct lro_entry *le, const struct mbuf *m,
    int frm, int32_t tcp_data_len, uint32_t th_seq,
    uint32_t th_ack, uint16_t th_win)
{
	if (tcp_bblogging_on(tp)) {
		union tcp_log_stackspecific log;
		struct timeval tv, btv;
		uint32_t cts;

		cts = tcp_get_usecs(&tv);
		memset(&log, 0, sizeof(union tcp_log_stackspecific));
		log.u_bbr.flex8 = frm;
		log.u_bbr.flex1 = tcp_data_len;
		if (m)
			log.u_bbr.flex2 = m->m_pkthdr.len;
		else
			log.u_bbr.flex2 = 0;
		if (le->m_head) {
			log.u_bbr.flex3 = le->m_head->m_pkthdr.lro_nsegs;
			log.u_bbr.flex4 = le->m_head->m_pkthdr.lro_tcp_d_len;
			log.u_bbr.flex5 = le->m_head->m_pkthdr.len;
			log.u_bbr.delRate = le->m_head->m_flags;
			log.u_bbr.rttProp = le->m_head->m_pkthdr.rcv_tstmp;
		}
		log.u_bbr.inflight = th_seq;
		log.u_bbr.delivered = th_ack;
		log.u_bbr.timeStamp = cts;
		log.u_bbr.epoch = le->next_seq;
		log.u_bbr.lt_epoch = le->ack_seq;
		log.u_bbr.pacing_gain = th_win;
		log.u_bbr.cwnd_gain = le->window;
		log.u_bbr.lost = curcpu;
		log.u_bbr.cur_del_rate = (uintptr_t)m;
		log.u_bbr.bw_inuse = (uintptr_t)le->m_head;
		bintime2timeval(&lc->lro_last_queue_time, &btv);
		log.u_bbr.flex6 = tcp_tv_to_usectick(&btv);
		log.u_bbr.flex7 = le->compressed;
		log.u_bbr.pacing_gain = le->uncompressed;
		if (in_epoch(net_epoch_preempt))
			log.u_bbr.inhpts = 1;
		else
			log.u_bbr.inhpts = 0;
		TCP_LOG_EVENTP(tp, NULL, &tptosocket(tp)->so_rcv,
		    &tptosocket(tp)->so_snd,
		    TCP_LOG_LRO, 0, 0, &log, false, &tv);
	}
}

static struct mbuf *
tcp_lro_get_last_if_ackcmp(struct lro_ctrl *lc, struct lro_entry *le,
    struct tcpcb *tp, int32_t *new_m, bool can_append_old_cmp)
{
	struct mbuf *m;

	/* Look at the last mbuf, if any, in the queue. */
	if (can_append_old_cmp) {
		m = STAILQ_LAST(&tp->t_inqueue, mbuf, m_stailqpkt);
		if (m != NULL && (m->m_flags & M_ACKCMP) != 0) {
			if (M_TRAILINGSPACE(m) >= sizeof(struct tcp_ackent)) {
				tcp_lro_log(tp, lc, le, NULL, 23, 0, 0, 0, 0);
				*new_m = 0;
				counter_u64_add(tcp_extra_mbuf, 1);
				return (m);
			} else {
				/* Mark that we ran out of space. */
				tp->t_flags2 |= TF2_MBUF_L_ACKS;
			}
		}
	}
	/* Decide mbuf size. */
	tcp_lro_log(tp, lc, le, NULL, 21, 0, 0, 0, 0);
	if (tp->t_flags2 & TF2_MBUF_L_ACKS)
		m = m_getcl(M_NOWAIT, MT_DATA, M_ACKCMP | M_PKTHDR);
	else
		m = m_gethdr(M_NOWAIT, MT_DATA);

	if (__predict_false(m == NULL)) {
		counter_u64_add(tcp_would_have_but, 1);
		return (NULL);
	}
	counter_u64_add(tcp_comp_total, 1);
	m->m_pkthdr.rcvif = lc->ifp;
	m->m_flags |= M_ACKCMP;
	*new_m = 1;
	return (m);
}

/*
 * Do a BPF tap for either ACK_CMP packets or MBUF QUEUE type packets
 * and strip everything but the IPv4/IPv6 header.
 */
static bool
do_bpf_strip_and_compress(struct tcpcb *tp, struct lro_ctrl *lc,
    struct lro_entry *le, struct mbuf **pp, struct mbuf **cmp,
    struct mbuf **mv_to, bool *should_wake, bool bpf_req, bool lagg_bpf_req,
    struct ifnet *lagg_ifp, bool can_append_old_cmp)
{
	union {
		void *ptr;
		struct ip *ip4;
		struct ip6_hdr *ip6;
	} l3;
	struct mbuf *m;
	struct mbuf *nm;
	struct tcphdr *th;
	struct tcp_ackent *ack_ent;
	uint32_t *ts_ptr;
	int32_t n_mbuf;
	bool other_opts, can_compress;
	uint8_t lro_type;
	uint16_t iptos;
	int tcp_hdr_offset;
	int idx;

	/* Get the current mbuf. */
	m = *pp;

	/* Let BPF see the packet. */
	if (__predict_false(bpf_req))
		ETHER_BPF_MTAP(lc->ifp, m);

	if (__predict_false(lagg_bpf_req))
		ETHER_BPF_MTAP(lagg_ifp, m);

	tcp_hdr_offset = m->m_pkthdr.lro_tcp_h_off;
	lro_type = le->inner.data.lro_type;
	switch (lro_type) {
	case LRO_TYPE_NONE:
		lro_type = le->outer.data.lro_type;
		switch (lro_type) {
		case LRO_TYPE_IPV4_TCP:
			tcp_hdr_offset -= sizeof(*le->outer.ip4);
			m->m_pkthdr.lro_etype = ETHERTYPE_IP;
			break;
		case LRO_TYPE_IPV6_TCP:
			tcp_hdr_offset -= sizeof(*le->outer.ip6);
			m->m_pkthdr.lro_etype = ETHERTYPE_IPV6;
			break;
		default:
			goto compressed;
		}
		break;
	case LRO_TYPE_IPV4_TCP:
		tcp_hdr_offset -= sizeof(*le->outer.ip4);
		m->m_pkthdr.lro_etype = ETHERTYPE_IP;
		break;
	case LRO_TYPE_IPV6_TCP:
		tcp_hdr_offset -= sizeof(*le->outer.ip6);
		m->m_pkthdr.lro_etype = ETHERTYPE_IPV6;
		break;
	default:
		goto compressed;
	}

	MPASS(tcp_hdr_offset >= 0);

	m_adj(m, tcp_hdr_offset);
	m->m_flags |= M_LRO_EHDRSTRP;
	m->m_flags &= ~M_ACKCMP;
	m->m_pkthdr.lro_tcp_h_off -= tcp_hdr_offset;

	th = tcp_lro_get_th(m);

	th->th_sum = 0;		/* TCP checksum is valid. */

	/* Check if the ACK can be compressed. */
	can_compress = tcp_lro_ack_valid(m, th, &ts_ptr, &other_opts);

	/* Now let's look at the should-wake states. */
	if ((other_opts == true) &&
	    ((tp->t_flags2 & TF2_DONT_SACK_QUEUE) == 0)) {
		/*
		 * If there are other options (SACK?) and the
		 * TCP endpoint has not expressly told us it does
		 * not care about SACKs, then we should wake up.
		 */
		*should_wake = true;
	} else if (*should_wake == false) {
		/* Wakeup override check if we are false here. */
		*should_wake = tcp_lro_check_wake_status(tp);
	}
	/* Is the ACK compressible? */
	if (can_compress == false)
		goto done;
	/* Does the TCP endpoint support ACK compression? */
	if ((tp->t_flags2 & TF2_MBUF_ACKCMP) == 0)
		goto done;

	/* Let's get the TOS/traffic class field. */
	l3.ptr = mtod(m, void *);
	switch (lro_type) {
	case LRO_TYPE_IPV4_TCP:
		iptos = l3.ip4->ip_tos;
		break;
	case LRO_TYPE_IPV6_TCP:
		iptos = IPV6_TRAFFIC_CLASS(l3.ip6);
		break;
	default:
		iptos = 0;	/* Keep compiler happy. */
		break;
	}
	/* Now let's get space if we don't have some already. */
	if (*cmp == NULL) {
new_one:
		nm = tcp_lro_get_last_if_ackcmp(lc, le, tp, &n_mbuf,
		    can_append_old_cmp);
		if (__predict_false(nm == NULL))
			goto done;
		*cmp = nm;
		if (n_mbuf) {
			/*
			 * Link in the new cmp ack to our in-order place,
			 * first set our cmp ack's next to where we are.
			 */
			nm->m_nextpkt = m;
			(*pp) = nm;
			/*
			 * Set it up so mv_to is advanced to our
			 * compressed ack. This way the caller can
			 * advance pp to the right place.
			 */
			*mv_to = nm;
			/*
			 * Advance it here locally as well.
			 */
			pp = &nm->m_nextpkt;
		}
	} else {
		/* We have one already that we are working on. */
		nm = *cmp;
		if (M_TRAILINGSPACE(nm) < sizeof(struct tcp_ackent)) {
			/* We ran out of space. */
			tp->t_flags2 |= TF2_MBUF_L_ACKS;
			goto new_one;
		}
	}
	MPASS(M_TRAILINGSPACE(nm) >= sizeof(struct tcp_ackent));
	counter_u64_add(tcp_inp_lro_compressed, 1);
	le->compressed++;
	/* We can add in to the one on the tail. */
	ack_ent = mtod(nm, struct tcp_ackent *);
	idx = (nm->m_len / sizeof(struct tcp_ackent));
	build_ack_entry(&ack_ent[idx], th, m, ts_ptr, iptos);

	/* Bump the size of both pkt-hdr and len. */
	nm->m_len += sizeof(struct tcp_ackent);
	nm->m_pkthdr.len += sizeof(struct tcp_ackent);
compressed:
	/* Advance to the next mbuf before freeing. */
	*pp = m->m_nextpkt;
	m->m_nextpkt = NULL;
	m_freem(m);
	return (true);
done:
	counter_u64_add(tcp_uncomp_total, 1);
	le->uncompressed++;
	return (false);
}

/*
 * Move all mbufs collected in the LRO entry onto the connection's
 * input queue.
 */
static void
tcp_queue_pkts(struct tcpcb *tp, struct lro_entry *le)
{

	INP_WLOCK_ASSERT(tptoinpcb(tp));

	STAILQ_HEAD(, mbuf) q = { le->m_head,
	    &STAILQ_NEXT(le->m_last_mbuf, m_stailqpkt) };
	STAILQ_CONCAT(&tp->t_inqueue, &q);
	le->m_head = NULL;
	le->m_last_mbuf = NULL;
}

/*
 * Look up the connection matching the given LRO parser data.  On
 * success a write-locked tcpcb is returned, otherwise NULL.
 */
static struct tcpcb *
tcp_lro_lookup(struct ifnet *ifp, struct lro_parser *pa)
{
	struct inpcb *inp;

	CURVNET_SET(ifp->if_vnet);
	switch (pa->data.lro_type) {
#ifdef INET6
	case LRO_TYPE_IPV6_TCP:
		inp = in6_pcblookup(&V_tcbinfo,
		    &pa->data.s_addr.v6,
		    pa->data.s_port,
		    &pa->data.d_addr.v6,
		    pa->data.d_port,
		    INPLOOKUP_WLOCKPCB,
		    ifp);
		break;
#endif
#ifdef INET
	case LRO_TYPE_IPV4_TCP:
		inp = in_pcblookup(&V_tcbinfo,
		    pa->data.s_addr.v4,
		    pa->data.s_port,
		    pa->data.d_addr.v4,
		    pa->data.d_port,
		    INPLOOKUP_WLOCKPCB,
		    ifp);
		break;
#endif
	default:
		CURVNET_RESTORE();
		return (NULL);
	}
	CURVNET_RESTORE();

	return (intotcpcb(inp));
}

static int
_tcp_lro_flush_tcphpts(struct lro_ctrl *lc, struct lro_entry *le)
{
	struct tcpcb *tp;
	struct mbuf **pp, *cmp, *mv_to;
	struct ifnet *lagg_ifp;
	bool bpf_req, lagg_bpf_req, should_wake, can_append_old_cmp;

	/* Check if the packet doesn't belong to our network interface. */
	if ((tcplro_stacks_wanting_mbufq == 0) ||
	    (le->outer.data.vlan_id != 0) ||
	    (le->inner.data.lro_type != LRO_TYPE_NONE))
		return (TCP_LRO_CANNOT);

#ifdef INET6
	/*
	 * Be proactive about an unspecified IPv6 source address.  As
	 * we use all-zeros to indicate an unbound/unconnected pcb, an
	 * unspecified IPv6 address can be used to confuse us.
	 *
	 * Note that packets with an unspecified IPv6 destination are
	 * already dropped in ip6_input.
	 */
	if (__predict_false(le->outer.data.lro_type == LRO_TYPE_IPV6_TCP &&
	    IN6_IS_ADDR_UNSPECIFIED(&le->outer.data.s_addr.v6)))
		return (TCP_LRO_CANNOT);

	if (__predict_false(le->inner.data.lro_type == LRO_TYPE_IPV6_TCP &&
	    IN6_IS_ADDR_UNSPECIFIED(&le->inner.data.s_addr.v6)))
		return (TCP_LRO_CANNOT);
#endif
	/* Lookup inp, if any.  Returns locked TCP inpcb. */
	tp = tcp_lro_lookup(lc->ifp,
	    (le->inner.data.lro_type == LRO_TYPE_NONE) ? &le->outer : &le->inner);
	if (tp == NULL)
		return (TCP_LRO_CANNOT);

	counter_u64_add(tcp_inp_lro_locks_taken, 1);

	/* Check if the inp is dead, Jim. */
	if (tp->t_state == TCPS_TIME_WAIT) {
		INP_WUNLOCK(tptoinpcb(tp));
		return (TCP_LRO_CANNOT);
	}
	if (tp->t_lro_cpu == HPTS_CPU_NONE && lc->lro_cpu_is_set == 1)
		tp->t_lro_cpu = lc->lro_last_cpu;
	/* Check if the transport doesn't support the needed optimizations. */
	if ((tp->t_flags2 & (TF2_SUPPORTS_MBUFQ | TF2_MBUF_ACKCMP)) == 0) {
		INP_WUNLOCK(tptoinpcb(tp));
		return (TCP_LRO_CANNOT);
	}

	if (tp->t_flags2 & TF2_MBUF_QUEUE_READY)
		should_wake = false;
	else
		should_wake = true;
	/* Check if packets should be tapped to BPF. */
	bpf_req = bpf_peers_present(lc->ifp->if_bpf);
	lagg_bpf_req = false;
	lagg_ifp = NULL;
	if (lc->ifp->if_type == IFT_IEEE8023ADLAG ||
	    lc->ifp->if_type == IFT_INFINIBANDLAG) {
		struct lagg_port *lp = lc->ifp->if_lagg;
		struct lagg_softc *sc = lp->lp_softc;

		lagg_ifp = sc->sc_ifp;
		if (lagg_ifp != NULL)
			lagg_bpf_req = bpf_peers_present(lagg_ifp->if_bpf);
	}

	/* Strip and compress all the incoming packets. */
	can_append_old_cmp = true;
	cmp = NULL;
	for (pp = &le->m_head; *pp != NULL; ) {
		mv_to = NULL;
		if (do_bpf_strip_and_compress(tp, lc, le, pp, &cmp, &mv_to,
		    &should_wake, bpf_req, lagg_bpf_req, lagg_ifp,
		    can_append_old_cmp) == false) {
			/* Advance to next mbuf. */
			pp = &(*pp)->m_nextpkt;
			/*
			 * Once we have appended we can't look in the pending
			 * inbound packets for a compressed ack to append to.
			 */
			can_append_old_cmp = false;
			/*
			 * Once we append we also need to stop adding to any
			 * compressed ack we were remembering.  A new cmp
			 * ack will be required.
			 */
			cmp = NULL;
			tcp_lro_log(tp, lc, le, NULL, 25, 0, 0, 0, 0);
		} else if (mv_to != NULL) {
			/* We are asked to move pp up. */
			pp = &mv_to->m_nextpkt;
			tcp_lro_log(tp, lc, le, NULL, 24, 0, 0, 0, 0);
		} else
			tcp_lro_log(tp, lc, le, NULL, 26, 0, 0, 0, 0);
	}
	/* Update "m_last_mbuf", if any. */
	if (pp == &le->m_head)
		le->m_last_mbuf = *pp;
	else
		le->m_last_mbuf = __containerof(pp, struct mbuf, m_nextpkt);

	/* Check if any data mbufs are left. */
	if (le->m_head != NULL) {
		counter_u64_add(tcp_inp_lro_direct_queue, 1);
		tcp_lro_log(tp, lc, le, NULL, 22, 1, tp->t_flags2, 0, 1);
		tcp_queue_pkts(tp, le);
	}
	if (should_wake) {
		/* Wakeup */
		counter_u64_add(tcp_inp_lro_wokeup_queue, 1);
		if ((*tp->t_fb->tfb_do_queued_segments)(tp, 0))
			/* TCP cb gone and unlocked. */
			return (0);
	}
	INP_WUNLOCK(tptoinpcb(tp));

	return (0);	/* Success. */
}

void
tcp_lro_hpts_init(void)
{
	tcp_lro_flush_tcphpts = _tcp_lro_flush_tcphpts;
}

void
tcp_lro_hpts_uninit(void)
{
	atomic_store_ptr(&tcp_lro_flush_tcphpts, NULL);
}