/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
 * All rights reserved.
 * Copyright (c) 2019 Netflix, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the project nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $KAME: frag6.c,v 1.33 2002/01/07 11:34:48 kjc Exp $
 */

#include <sys/cdefs.h>
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domain.h>
#include <sys/eventhandler.h>
#include <sys/hash.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/queue.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/if_private.h>
#include <net/netisr.h>
#include <net/route.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet/icmp6.h>
#include <netinet/in_systm.h>	/* For ECN definitions. */
#include <netinet/ip.h>		/* For ECN definitions. */

#ifdef MAC
#include <security/mac/mac_framework.h>
#endif

/*
 * A "big picture" of how IPv6 fragment queues are all linked together.
 *
 * struct ip6qbucket ip6qb[...];			hashed buckets
 * ||||||||
 * |
 * +--- TAILQ(struct ip6q, packets) *q6;		tailq entries holding
 *      ||||||||					fragmented packets
 *      |						(1 per original packet)
 *      |
 *      +--- TAILQ(struct ip6asfrag, ip6q_frags) *af6;	tailq entries of IPv6
 *           |						*ip6af; fragment packets
 *           |						for one original packet
 *           + *mbuf
 */

/* Reassembly headers are stored in hash buckets. */
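/*
 * Illustrative arithmetic (not from the original comments): with
 * IP6REASS_NHASH_LOG2 == 10 below there are 1 << 10 == 1024 buckets and
 * IP6REASS_HMASK == 0x3ff, so a 32-bit hash value is reduced to a
 * bucket index with "hash & IP6REASS_HMASK".
 */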
#define	IP6REASS_NHASH_LOG2	10
#define	IP6REASS_NHASH		(1 << IP6REASS_NHASH_LOG2)
#define	IP6REASS_HMASK		(IP6REASS_NHASH - 1)

TAILQ_HEAD(ip6qhead, ip6q);
struct ip6qbucket {
	struct ip6qhead	packets;
	struct mtx	lock;
	int		count;
};

struct ip6asfrag {
	TAILQ_ENTRY(ip6asfrag) ip6af_tq;
	struct mbuf	*ip6af_m;
	int		ip6af_offset;	/* Offset in ip6af_m to next header. */
	int		ip6af_frglen;	/* Fragmentable part length. */
	int		ip6af_off;	/* Fragment offset. */
	bool		ip6af_mff;	/* More fragment bit in frag off. */
};

static MALLOC_DEFINE(M_FRAG6, "frag6", "IPv6 fragment reassembly header");

#ifdef VIMAGE
/* A flag to indicate if IPv6 fragmentation is initialized. */
VNET_DEFINE_STATIC(bool, frag6_on);
#define	V_frag6_on	VNET(frag6_on)
#endif

/* System wide (global) maximum and count of packets in reassembly queues. */
static int ip6_maxfrags;
static u_int __exclusive_cache_line frag6_nfrags;

/* Maximum and current packets in per-VNET reassembly queue. */
VNET_DEFINE_STATIC(int, ip6_maxfragpackets);
VNET_DEFINE_STATIC(volatile u_int, frag6_nfragpackets);
#define	V_ip6_maxfragpackets	VNET(ip6_maxfragpackets)
#define	V_frag6_nfragpackets	VNET(frag6_nfragpackets)

/* Maximum per-VNET reassembly timeout (milliseconds). */
VNET_DEFINE_STATIC(u_int, ip6_fraglifetime) = IPV6_DEFFRAGTTL;
#define	V_ip6_fraglifetime	VNET(ip6_fraglifetime)

/* Maximum per-VNET reassembly queues per bucket and fragments per packet. */
VNET_DEFINE_STATIC(int, ip6_maxfragbucketsize);
VNET_DEFINE_STATIC(int, ip6_maxfragsperpacket);
#define	V_ip6_maxfragbucketsize	VNET(ip6_maxfragbucketsize)
#define	V_ip6_maxfragsperpacket	VNET(ip6_maxfragsperpacket)

/* Per-VNET reassembly queue buckets. */
VNET_DEFINE_STATIC(struct ip6qbucket, ip6qb[IP6REASS_NHASH]);
VNET_DEFINE_STATIC(uint32_t, ip6qb_hashseed);
#define	V_ip6qb			VNET(ip6qb)
#define	V_ip6qb_hashseed	VNET(ip6qb_hashseed)

#define	IP6QB_LOCK(_b)		mtx_lock(&V_ip6qb[(_b)].lock)
#define	IP6QB_TRYLOCK(_b)	mtx_trylock(&V_ip6qb[(_b)].lock)
#define	IP6QB_LOCK_ASSERT(_b)	mtx_assert(&V_ip6qb[(_b)].lock, MA_OWNED)
#define	IP6QB_UNLOCK(_b)	mtx_unlock(&V_ip6qb[(_b)].lock)
#define	IP6QB_HEAD(_b)		(&V_ip6qb[(_b)].packets)

/*
 * By default, limit the number of IP6 fragments across all reassembly
 * queues to 1/32 of the total number of mbuf clusters.
 *
 * Limit the total number of reassembly queues per VNET to the
 * IP6 fragment limit, but ensure the limit will not allow any bucket
 * to grow above 100 items.  (The bucket limit is
 * IP6_MAXFRAGPACKETS / (IP6REASS_NHASH / 2), so the 50 is the correct
 * multiplier to reach a 100-item limit.)
 * The 100-item limit was chosen as brief testing seems to show that
 * this produces "reasonable" performance on some subset of systems
 * under DoS attack.
 */
#define	IP6_MAXFRAGS		(nmbclusters / 32)
#define	IP6_MAXFRAGPACKETS	(imin(IP6_MAXFRAGS, IP6REASS_NHASH * 50))
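/*
 * Worked example with illustrative numbers: on a machine with
 * nmbclusters == 1048576, IP6_MAXFRAGS is 32768 and IP6_MAXFRAGPACKETS
 * is imin(32768, 1024 * 50) == 32768; frag6_set_bucketsize() below then
 * caps each bucket at imax(32768 / (1024 / 2), 1) == 64 entries.
 */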
/* Interval between periodic reassembly queue inspections. */
#define	IP6_CALLOUT_INTERVAL_MS	500

/*
 * Sysctls and helper functions.
 */
SYSCTL_DECL(_net_inet6_ip6);

SYSCTL_UINT(_net_inet6_ip6, OID_AUTO, frag6_nfrags,
    CTLFLAG_RD, &frag6_nfrags, 0,
    "Global number of IPv6 fragments across all reassembly queues.");

static void
frag6_set_bucketsize(void)
{
	int i;

	if ((i = V_ip6_maxfragpackets) > 0)
		V_ip6_maxfragbucketsize = imax(i / (IP6REASS_NHASH / 2), 1);
}

SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGS, maxfrags,
    CTLFLAG_RW, &ip6_maxfrags, 0,
    "Maximum allowed number of outstanding IPv6 packet fragments. "
    "A value of 0 means no fragmented packets will be accepted, while "
    "a value of -1 means no limit");

static int
sysctl_ip6_maxfragpackets(SYSCTL_HANDLER_ARGS)
{
	int error, val;

	val = V_ip6_maxfragpackets;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error != 0 || !req->newptr)
		return (error);
	V_ip6_maxfragpackets = val;
	frag6_set_bucketsize();
	return (0);
}
SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_MAXFRAGPACKETS, maxfragpackets,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    NULL, 0, sysctl_ip6_maxfragpackets, "I",
    "Default maximum number of outstanding fragmented IPv6 packets. "
    "A value of 0 means no fragmented packets will be accepted, while "
    "a value of -1 means no limit");
SYSCTL_UINT(_net_inet6_ip6, OID_AUTO, frag6_nfragpackets,
    CTLFLAG_VNET | CTLFLAG_RD,
    __DEVOLATILE(u_int *, &VNET_NAME(frag6_nfragpackets)), 0,
    "Per-VNET number of IPv6 fragments across all reassembly queues.");
SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGSPERPACKET, maxfragsperpacket,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfragsperpacket), 0,
    "Maximum allowed number of fragments per packet");
SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGBUCKETSIZE, maxfragbucketsize,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfragbucketsize), 0,
    "Maximum number of reassembly queues per hash bucket");

static int
frag6_milli_to_callout_ticks(int ms)
{
	return (ms / IP6_CALLOUT_INTERVAL_MS);
}

static int
frag6_callout_ticks_to_milli(int ticks)
{
	return (ticks * IP6_CALLOUT_INTERVAL_MS);
}

_Static_assert(sizeof(((struct ip6q *)NULL)->ip6q_ttl) >= 2,
    "ip6q_ttl field is not large enough");

static int
sysctl_ip6_fraglifetime(SYSCTL_HANDLER_ARGS)
{
	int error, val;

	val = V_ip6_fraglifetime;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error != 0 || !req->newptr)
		return (error);
	if (val <= 0)
		val = IPV6_DEFFRAGTTL;

	if (frag6_milli_to_callout_ticks(val) >= 65536)
		val = frag6_callout_ticks_to_milli(65535);
#ifdef VIMAGE
	if (!IS_DEFAULT_VNET(curvnet)) {
		CURVNET_SET(vnet0);
		int host_val = V_ip6_fraglifetime;
		CURVNET_RESTORE();

		if (val > host_val)
			val = host_val;
	}
#endif
	V_ip6_fraglifetime = val;
	return (0);
}
SYSCTL_PROC(_net_inet6_ip6, OID_AUTO, fraglifetime_ms,
    CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    NULL, 0, sysctl_ip6_fraglifetime, "I",
    "Fragment lifetime, in milliseconds");
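/*
 * Note (illustrative, assuming a 60000 ms lifetime): the per-queue TTL
 * is stored in 500 ms callout ticks, so such a lifetime becomes
 * 60000 / IP6_CALLOUT_INTERVAL_MS == 120 ticks; the clamp above keeps
 * the tick count below 65536 so it fits the 16-bit ip6q_ttl field.
 */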
/*
 * Remove the IPv6 fragmentation header from the mbuf.
 */
int
ip6_deletefraghdr(struct mbuf *m, int offset, int wait __unused)
{
	struct ip6_hdr *ip6;

	KASSERT(m->m_len >= offset + sizeof(struct ip6_frag),
	    ("%s: ext headers not contiguous in mbuf %p m_len %d >= "
	    "offset %d + %zu\n", __func__, m, m->m_len, offset,
	    sizeof(struct ip6_frag)));

	/* Delete frag6 header. */
	ip6 = mtod(m, struct ip6_hdr *);
	bcopy(ip6, (char *)ip6 + sizeof(struct ip6_frag), offset);
	m->m_data += sizeof(struct ip6_frag);
	m->m_len -= sizeof(struct ip6_frag);
	m->m_flags |= M_FRAGMENTED;

	return (0);
}

/*
 * Free a fragment reassembly header and all associated datagrams.
 */
static void
frag6_freef(struct ip6q *q6, uint32_t bucket)
{
	struct ip6_hdr *ip6;
	struct ip6asfrag *af6;
	struct mbuf *m;

	IP6QB_LOCK_ASSERT(bucket);

	while ((af6 = TAILQ_FIRST(&q6->ip6q_frags)) != NULL) {
		m = af6->ip6af_m;
		TAILQ_REMOVE(&q6->ip6q_frags, af6, ip6af_tq);

		/*
		 * Return ICMP time exceeded error for the 1st fragment.
		 * Just free other fragments.
		 */
		if (af6->ip6af_off == 0 && m->m_pkthdr.rcvif != NULL) {
			/* Adjust pointer. */
			ip6 = mtod(m, struct ip6_hdr *);

			/* Restore source and destination addresses. */
			ip6->ip6_src = q6->ip6q_src;
			ip6->ip6_dst = q6->ip6q_dst;

			icmp6_error(m, ICMP6_TIME_EXCEEDED,
			    ICMP6_TIME_EXCEED_REASSEMBLY, 0);
		} else
			m_freem(m);

		free(af6, M_FRAG6);
	}

	TAILQ_REMOVE(IP6QB_HEAD(bucket), q6, ip6q_tq);
	V_ip6qb[bucket].count--;
	atomic_subtract_int(&frag6_nfrags, q6->ip6q_nfrag);
#ifdef MAC
	mac_ip6q_destroy(q6);
#endif
	free(q6, M_FRAG6);
	atomic_subtract_int(&V_frag6_nfragpackets, 1);
}

/*
 * Drain off all datagram fragments belonging to
 * the given network interface.
 */
static void
frag6_cleanup(void *arg __unused, struct ifnet *ifp)
{
	struct ip6qhead *head;
	struct ip6q *q6;
	struct ip6asfrag *af6;
	uint32_t bucket;

	KASSERT(ifp != NULL, ("%s: ifp is NULL", __func__));

	CURVNET_SET_QUIET(ifp->if_vnet);
#ifdef VIMAGE
	/*
	 * Skip processing if IPv6 reassembly is not initialised or
	 * torn down by frag6_destroy().
	 */
	if (!V_frag6_on) {
		CURVNET_RESTORE();
		return;
	}
#endif

	for (bucket = 0; bucket < IP6REASS_NHASH; bucket++) {
		IP6QB_LOCK(bucket);
		head = IP6QB_HEAD(bucket);
		/* Scan fragment list. */
		TAILQ_FOREACH(q6, head, ip6q_tq) {
			TAILQ_FOREACH(af6, &q6->ip6q_frags, ip6af_tq) {
				/* Clear no longer valid rcvif pointer. */
				if (af6->ip6af_m->m_pkthdr.rcvif == ifp)
					af6->ip6af_m->m_pkthdr.rcvif = NULL;
			}
		}
		IP6QB_UNLOCK(bucket);
	}
	CURVNET_RESTORE();
}
EVENTHANDLER_DEFINE(ifnet_departure_event, frag6_cleanup, NULL, 0);
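/*
 * For reference (layout from <netinet/ip6.h>, repeated here as an aid):
 * the fragment header manipulated below is 8 bytes, consisting of
 * ip6f_nxt (next header), ip6f_reserved, ip6f_offlg (13-bit fragment
 * offset, two reserved bits, and the M "more fragments" flag), and the
 * 32-bit ip6f_ident identification field.
 */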
/*
 * As in RFC 2460, the fragment and reassembly rules in RFC 8200 do not
 * agree with each other in terms of next header field handling in the
 * fragment header.  While the sender will use the same value for all of
 * the fragmented packets, the receiver is advised not to check for
 * consistency.
 *
 * Fragment rules (p18, p19):
 *	(2)  A Fragment header containing:
 *	The Next Header value that identifies the first header
 *	after the Per-Fragment headers of the original packet.
 *		-> next header field is same for all fragments
 *
 * Reassembly rule (p20):
 *	The Next Header field of the last header of the Per-Fragment
 *	headers is obtained from the Next Header field of the first
 *	fragment's Fragment header.
 *		-> should grab it from the first fragment only
 *
 * The following note also contradicts the fragment rule: no one is
 * going to send different fragments with different next header fields.
 *
 * Additional note (p22) [not an error]:
 *	The Next Header values in the Fragment headers of different
 *	fragments of the same original packet may differ.  Only the value
 *	from the Offset zero fragment packet is used for reassembly.
 *		-> should grab it from the first fragment only
 *
 * There is no explicit reason given in the RFC.  Historical reason maybe?
 */
/*
 * Fragment input.
 */
int
frag6_input(struct mbuf **mp, int *offp, int proto)
{
	struct mbuf *m, *t;
	struct ip6_hdr *ip6;
	struct ip6_frag *ip6f;
	struct ip6qhead *head;
	struct ip6q *q6;
	struct ip6asfrag *af6, *ip6af, *af6tmp;
	struct in6_ifaddr *ia6;
	struct ifnet *dstifp, *srcifp;
	uint32_t hashkey[(sizeof(struct in6_addr) * 2 +
	    sizeof(ip6f->ip6f_ident)) / sizeof(uint32_t)];
	uint32_t bucket, *hashkeyp;
	int fragoff, frgpartlen;	/* Must be larger than uint16_t. */
	int nxt, offset, plen;
	uint8_t ecn, ecn0;
	bool only_frag;
#ifdef RSS
	struct ip6_direct_ctx *ip6dc;
	struct m_tag *mtag;
#endif

	m = *mp;
	offset = *offp;

	M_ASSERTPKTHDR(m);

	if (m->m_len < offset + sizeof(struct ip6_frag)) {
		m = m_pullup(m, offset + sizeof(struct ip6_frag));
		if (m == NULL) {
			IP6STAT_INC(ip6s_exthdrtoolong);
			*mp = NULL;
			return (IPPROTO_DONE);
		}
	}
	ip6 = mtod(m, struct ip6_hdr *);

	dstifp = NULL;
	/* Find the destination interface of the packet. */
	ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */, false);
	if (ia6 != NULL)
		dstifp = ia6->ia_ifp;

	/* Jumbo payload cannot contain a fragment header. */
	if (ip6->ip6_plen == 0) {
		icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
		    offset);
		in6_ifstat_inc(dstifp, ifs6_reass_fail);
		*mp = NULL;
		return (IPPROTO_DONE);
	}

	/*
	 * Check whether the fragment packet's fragment length is a
	 * multiple of 8 octets (unless it is the last one).
	 * sizeof(struct ip6_frag) == 8
	 * sizeof(struct ip6_hdr) == 40
	 */
	ip6f = (struct ip6_frag *)((caddr_t)ip6 + offset);
	if ((ip6f->ip6f_offlg & IP6F_MORE_FRAG) &&
	    (((ntohs(ip6->ip6_plen) - offset) & 0x7) != 0)) {
		icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
		    offsetof(struct ip6_hdr, ip6_plen));
		in6_ifstat_inc(dstifp, ifs6_reass_fail);
		*mp = NULL;
		return (IPPROTO_DONE);
	}

	IP6STAT_INC(ip6s_fragments);
	in6_ifstat_inc(dstifp, ifs6_reass_reqd);

	/*
	 * Handle "atomic" fragments (offset and m bit set to 0) upfront,
	 * unrelated to any reassembly.  We need to remove the frag hdr
	 * which is ugly.
	 * See RFC 6946 and section 4.5 of RFC 8200.
	 */
	if ((ip6f->ip6f_offlg & ~IP6F_RESERVED_MASK) == 0) {
		IP6STAT_INC(ip6s_atomicfrags);
		nxt = ip6f->ip6f_nxt;
		/*
		 * Set nxt(-hdr field value) to the original value.
		 * We cannot just set ip6->ip6_nxt as there might be
		 * an unfragmentable part with extension headers and
		 * we must update the last one.
		 */
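		/*
		 * ip6_get_prevhdr() yields the offset of the next header
		 * field of the header immediately preceding the fragment
		 * header, so the one-byte m_copyback() below rewrites
		 * exactly that field.
		 */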
		m_copyback(m, ip6_get_prevhdr(m, offset), sizeof(uint8_t),
		    (caddr_t)&nxt);
		ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) -
		    sizeof(struct ip6_frag));
		if (ip6_deletefraghdr(m, offset, M_NOWAIT) != 0)
			goto dropfrag2;
		m->m_pkthdr.len -= sizeof(struct ip6_frag);
		in6_ifstat_inc(dstifp, ifs6_reass_ok);
		*mp = m;
		return (nxt);
	}

	/* Offset now points to data portion. */
	offset += sizeof(struct ip6_frag);

	/* Get fragment length and discard 0-byte fragments. */
	frgpartlen = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen) - offset;
	if (frgpartlen == 0) {
		icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
		    offsetof(struct ip6_hdr, ip6_plen));
		in6_ifstat_inc(dstifp, ifs6_reass_fail);
		IP6STAT_INC(ip6s_fragdropped);
		*mp = NULL;
		return (IPPROTO_DONE);
	}

	/*
	 * Enforce upper bound on number of fragments for the entire system.
	 * If maxfrag is 0, never accept fragments.
	 * If maxfrag is -1, accept all fragments without limitation.
	 */
	if (ip6_maxfrags < 0)
		;
	else if (atomic_load_int(&frag6_nfrags) >= (u_int)ip6_maxfrags)
		goto dropfrag2;

	/*
	 * Validate that a full header chain to the ULP is present in the
	 * packet containing the first fragment as per RFC 7112 and
	 * RFC 8200 pages 18, 19:
	 * The first fragment packet is composed of:
	 * (3)  Extension headers, if any, and the Upper-Layer header.  These
	 *      headers must be in the first fragment.  ...
	 */
	fragoff = ntohs(ip6f->ip6f_offlg & IP6F_OFF_MASK);
	/* XXX TODO.  thj has D16851 open for this. */
	/* Send ICMPv6 4,3 in case of violation. */

	/* Store receive network interface pointer for later. */
	srcifp = m->m_pkthdr.rcvif;

	/* Generate a hash value for fragment bucket selection. */
	hashkeyp = hashkey;
	memcpy(hashkeyp, &ip6->ip6_src, sizeof(struct in6_addr));
	hashkeyp += sizeof(struct in6_addr) / sizeof(*hashkeyp);
	memcpy(hashkeyp, &ip6->ip6_dst, sizeof(struct in6_addr));
	hashkeyp += sizeof(struct in6_addr) / sizeof(*hashkeyp);
	*hashkeyp = ip6f->ip6f_ident;
	bucket = jenkins_hash32(hashkey, nitems(hashkey), V_ip6qb_hashseed);
	bucket &= IP6REASS_HMASK;
	IP6QB_LOCK(bucket);
	head = IP6QB_HEAD(bucket);

	TAILQ_FOREACH(q6, head, ip6q_tq)
		if (ip6f->ip6f_ident == q6->ip6q_ident &&
		    IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &q6->ip6q_src) &&
		    IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &q6->ip6q_dst)
#ifdef MAC
		    && mac_ip6q_match(m, q6)
#endif
		    )
			break;

	only_frag = false;
	if (q6 == NULL) {
		/* A first fragment to arrive creates a reassembly queue. */
		only_frag = true;

		/*
		 * Enforce upper bound on number of fragmented packets
		 * for which we attempt reassembly;
		 * If maxfragpackets is 0, never accept fragments.
		 * If maxfragpackets is -1, accept all fragments without
		 * limitation.
		 */
		if (V_ip6_maxfragpackets < 0)
			;
		else if (V_ip6qb[bucket].count >= V_ip6_maxfragbucketsize ||
		    atomic_load_int(&V_frag6_nfragpackets) >=
		    (u_int)V_ip6_maxfragpackets)
			goto dropfrag;
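		/*
		 * Per RFC 8200, Section 4.5, fragments are grouped for
		 * reassembly by source address, destination address, and
		 * fragment identification; no queue matched that tuple
		 * above, so this fragment starts a new one.
		 */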
		/* Allocate IPv6 fragment packet queue entry. */
		q6 = (struct ip6q *)malloc(sizeof(struct ip6q), M_FRAG6,
		    M_NOWAIT | M_ZERO);
		if (q6 == NULL)
			goto dropfrag;
#ifdef MAC
		if (mac_ip6q_init(q6, M_NOWAIT) != 0) {
			free(q6, M_FRAG6);
			goto dropfrag;
		}
		mac_ip6q_create(m, q6);
#endif
		atomic_add_int(&V_frag6_nfragpackets, 1);

		/* ip6q_nxt will be filled afterwards, from 1st fragment. */
		TAILQ_INIT(&q6->ip6q_frags);
		q6->ip6q_ident = ip6f->ip6f_ident;
		q6->ip6q_ttl = frag6_milli_to_callout_ticks(V_ip6_fraglifetime);
		q6->ip6q_src = ip6->ip6_src;
		q6->ip6q_dst = ip6->ip6_dst;
		q6->ip6q_ecn = IPV6_ECN(ip6);
		q6->ip6q_unfrglen = -1;	/* The 1st fragment has not arrived. */

		/* Add the fragmented packet to the bucket. */
		TAILQ_INSERT_HEAD(head, q6, ip6q_tq);
		V_ip6qb[bucket].count++;
	}

	/*
	 * If it is the 1st fragment, record the length of the
	 * unfragmentable part and the next header of the fragment header.
	 * Assume the first offset-zero fragment to arrive is correct.
	 * We do not have any duplicate checks here yet so another packet
	 * with fragoff == 0 could come and overwrite the ip6q_unfrglen
	 * and, worse, the next header, at any time.
	 */
	if (fragoff == 0 && q6->ip6q_unfrglen == -1) {
		q6->ip6q_unfrglen = offset - sizeof(struct ip6_hdr) -
		    sizeof(struct ip6_frag);
		q6->ip6q_nxt = ip6f->ip6f_nxt;
		/* XXX ECN? */
	}

	/*
	 * Check that the reassembled packet would not exceed 65535 bytes
	 * in size.
	 * If it would exceed, discard the fragment and return an ICMP error.
	 */
	if (q6->ip6q_unfrglen >= 0) {
		/* The 1st fragment has already arrived. */
		if (q6->ip6q_unfrglen + fragoff + frgpartlen > IPV6_MAXPACKET) {
			if (only_frag) {
				TAILQ_REMOVE(head, q6, ip6q_tq);
				V_ip6qb[bucket].count--;
				atomic_subtract_int(&V_frag6_nfragpackets, 1);
#ifdef MAC
				mac_ip6q_destroy(q6);
#endif
				free(q6, M_FRAG6);
			}
			IP6QB_UNLOCK(bucket);
			icmp6_error(m, ICMP6_PARAM_PROB,
			    ICMP6_PARAMPROB_HEADER,
			    offset - sizeof(struct ip6_frag) +
			    offsetof(struct ip6_frag, ip6f_offlg));
			*mp = NULL;
			return (IPPROTO_DONE);
		}
	} else if (fragoff + frgpartlen > IPV6_MAXPACKET) {
		if (only_frag) {
			TAILQ_REMOVE(head, q6, ip6q_tq);
			V_ip6qb[bucket].count--;
			atomic_subtract_int(&V_frag6_nfragpackets, 1);
#ifdef MAC
			mac_ip6q_destroy(q6);
#endif
			free(q6, M_FRAG6);
		}
		IP6QB_UNLOCK(bucket);
		icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
		    offset - sizeof(struct ip6_frag) +
		    offsetof(struct ip6_frag, ip6f_offlg));
		*mp = NULL;
		return (IPPROTO_DONE);
	}

	/*
	 * If it is the first fragment, do the above check for each
	 * fragment already stored in the reassembly queue.
	 */
	if (fragoff == 0 && !only_frag) {
		TAILQ_FOREACH_SAFE(af6, &q6->ip6q_frags, ip6af_tq, af6tmp) {
			if (q6->ip6q_unfrglen + af6->ip6af_off +
			    af6->ip6af_frglen > IPV6_MAXPACKET) {
				struct ip6_hdr *ip6err;
				struct mbuf *merr;
				int erroff;

				merr = af6->ip6af_m;
				erroff = af6->ip6af_offset;

				/* Dequeue the fragment. */
				TAILQ_REMOVE(&q6->ip6q_frags, af6, ip6af_tq);
				q6->ip6q_nfrag--;
				atomic_subtract_int(&frag6_nfrags, 1);
				free(af6, M_FRAG6);

				/* Set a valid receive interface pointer. */
				merr->m_pkthdr.rcvif = srcifp;
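				/*
				 * The dequeued fragment itself is bounced
				 * back as an ICMPv6 Parameter Problem whose
				 * pointer refers to the ip6f_offlg field of
				 * its own fragment header.
				 */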
				/* Adjust pointer. */
				ip6err = mtod(merr, struct ip6_hdr *);

				/*
				 * Restore source and destination addresses
				 * in the erroneous IPv6 header.
				 */
				ip6err->ip6_src = q6->ip6q_src;
				ip6err->ip6_dst = q6->ip6q_dst;

				icmp6_error(merr, ICMP6_PARAM_PROB,
				    ICMP6_PARAMPROB_HEADER,
				    erroff - sizeof(struct ip6_frag) +
				    offsetof(struct ip6_frag, ip6f_offlg));
			}
		}
	}

	/* Allocate an IPv6 fragment queue entry for this fragmented part. */
	ip6af = (struct ip6asfrag *)malloc(sizeof(struct ip6asfrag), M_FRAG6,
	    M_NOWAIT | M_ZERO);
	if (ip6af == NULL)
		goto dropfrag;
	ip6af->ip6af_mff = (ip6f->ip6f_offlg & IP6F_MORE_FRAG) ? true : false;
	ip6af->ip6af_off = fragoff;
	ip6af->ip6af_frglen = frgpartlen;
	ip6af->ip6af_offset = offset;
	ip6af->ip6af_m = m;

	if (only_frag) {
		/*
		 * Do a manual insert rather than a hard-to-understand cast
		 * to a different type relying on data structure order to work.
		 */
		TAILQ_INSERT_HEAD(&q6->ip6q_frags, ip6af, ip6af_tq);
		goto postinsert;
	}

	/* Do duplicate, condition, and boundary checks. */
	/*
	 * Handle ECN by comparing this segment with the first one;
	 * if CE is set, do not lose CE.
	 * Drop if CE and not-ECT are mixed for the same packet.
	 */
	ecn = IPV6_ECN(ip6);
	ecn0 = q6->ip6q_ecn;
	if (ecn == IPTOS_ECN_CE) {
		if (ecn0 == IPTOS_ECN_NOTECT) {
			free(ip6af, M_FRAG6);
			goto dropfrag;
		}
		if (ecn0 != IPTOS_ECN_CE)
			q6->ip6q_ecn = IPTOS_ECN_CE;
	}
	if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT) {
		free(ip6af, M_FRAG6);
		goto dropfrag;
	}

	/* Find a fragmented part which begins after this one does. */
	TAILQ_FOREACH(af6, &q6->ip6q_frags, ip6af_tq)
		if (af6->ip6af_off > ip6af->ip6af_off)
			break;

	/*
	 * If the incoming fragment overlaps some existing fragments in
	 * the reassembly queue, drop both the new fragment and the
	 * entire reassembly queue.  However, if the new fragment
	 * is an exact duplicate of an existing fragment, only silently
	 * drop the new fragment and leave the reassembly queue
	 * unchanged, as allowed by the RFC.  (RFC 8200, 4.5)
	 */
	if (af6 != NULL)
		af6tmp = TAILQ_PREV(af6, ip6fraghead, ip6af_tq);
	else
		af6tmp = TAILQ_LAST(&q6->ip6q_frags, ip6fraghead);
	if (af6tmp != NULL) {
		if (af6tmp->ip6af_off + af6tmp->ip6af_frglen -
		    ip6af->ip6af_off > 0) {
			if (af6tmp->ip6af_off != ip6af->ip6af_off ||
			    af6tmp->ip6af_frglen != ip6af->ip6af_frglen)
				frag6_freef(q6, bucket);
			free(ip6af, M_FRAG6);
			goto dropfrag;
		}
	}
	if (af6 != NULL) {
		if (ip6af->ip6af_off + ip6af->ip6af_frglen -
		    af6->ip6af_off > 0) {
			if (af6->ip6af_off != ip6af->ip6af_off ||
			    af6->ip6af_frglen != ip6af->ip6af_frglen)
				frag6_freef(q6, bucket);
			free(ip6af, M_FRAG6);
			goto dropfrag;
		}
	}

#ifdef MAC
	mac_ip6q_update(m, q6);
#endif

	/*
	 * Stick new segment in its place; check for complete reassembly.
	 * If not complete, check fragment limit.  Move to front of packet
	 * queue, as we are the most recently active fragmented packet.
	 */
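	/*
	 * In the scan below, reassembly is complete only if every fragment
	 * starts exactly where the previous one ended (plen tracks the
	 * running total) and the last fragment has the M bit clear.
	 */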
	if (af6 != NULL)
		TAILQ_INSERT_BEFORE(af6, ip6af, ip6af_tq);
	else
		TAILQ_INSERT_TAIL(&q6->ip6q_frags, ip6af, ip6af_tq);
postinsert:
	atomic_add_int(&frag6_nfrags, 1);
	q6->ip6q_nfrag++;

	plen = 0;
	TAILQ_FOREACH(af6, &q6->ip6q_frags, ip6af_tq) {
		if (af6->ip6af_off != plen) {
			if (q6->ip6q_nfrag > V_ip6_maxfragsperpacket) {
				IP6STAT_ADD(ip6s_fragdropped, q6->ip6q_nfrag);
				frag6_freef(q6, bucket);
			}
			IP6QB_UNLOCK(bucket);
			*mp = NULL;
			return (IPPROTO_DONE);
		}
		plen += af6->ip6af_frglen;
	}
	af6 = TAILQ_LAST(&q6->ip6q_frags, ip6fraghead);
	if (af6->ip6af_mff) {
		if (q6->ip6q_nfrag > V_ip6_maxfragsperpacket) {
			IP6STAT_ADD(ip6s_fragdropped, q6->ip6q_nfrag);
			frag6_freef(q6, bucket);
		}
		IP6QB_UNLOCK(bucket);
		*mp = NULL;
		return (IPPROTO_DONE);
	}

	/* Reassembly is complete; concatenate fragments. */
	ip6af = TAILQ_FIRST(&q6->ip6q_frags);
	t = m = ip6af->ip6af_m;
	TAILQ_REMOVE(&q6->ip6q_frags, ip6af, ip6af_tq);
	while ((af6 = TAILQ_FIRST(&q6->ip6q_frags)) != NULL) {
		m->m_pkthdr.csum_flags &=
		    af6->ip6af_m->m_pkthdr.csum_flags;
		m->m_pkthdr.csum_data +=
		    af6->ip6af_m->m_pkthdr.csum_data;

		TAILQ_REMOVE(&q6->ip6q_frags, af6, ip6af_tq);
		t = m_last(t);
		m_adj(af6->ip6af_m, af6->ip6af_offset);
		m_demote_pkthdr(af6->ip6af_m);
		m_cat(t, af6->ip6af_m);
		free(af6, M_FRAG6);
	}

	while (m->m_pkthdr.csum_data & 0xffff0000)
		m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) +
		    (m->m_pkthdr.csum_data >> 16);

	/* Adjust offset to point where the original next header starts. */
	offset = ip6af->ip6af_offset - sizeof(struct ip6_frag);
	free(ip6af, M_FRAG6);
	if ((u_int)plen + (u_int)offset - sizeof(struct ip6_hdr) >
	    IPV6_MAXPACKET) {
		frag6_freef(q6, bucket);
		goto dropfrag;
	}
	ip6 = mtod(m, struct ip6_hdr *);
	ip6->ip6_plen = htons((u_short)plen + offset - sizeof(struct ip6_hdr));
	if (q6->ip6q_ecn == IPTOS_ECN_CE)
		ip6->ip6_flow |= htonl(IPTOS_ECN_CE << 20);
	nxt = q6->ip6q_nxt;

	TAILQ_REMOVE(head, q6, ip6q_tq);
	V_ip6qb[bucket].count--;
	atomic_subtract_int(&frag6_nfrags, q6->ip6q_nfrag);

	ip6_deletefraghdr(m, offset, M_NOWAIT);

	/* Set nxt(-hdr field value) to the original value. */
	m_copyback(m, ip6_get_prevhdr(m, offset), sizeof(uint8_t),
	    (caddr_t)&nxt);

#ifdef MAC
	mac_ip6q_reassemble(q6, m);
	mac_ip6q_destroy(q6);
#endif
	free(q6, M_FRAG6);
	atomic_subtract_int(&V_frag6_nfragpackets, 1);

	if (m->m_flags & M_PKTHDR) {	/* Isn't it always true? */
		plen = 0;
		for (t = m; t; t = t->m_next)
			plen += t->m_len;
		m->m_pkthdr.len = plen;
		/* Set a valid receive interface pointer. */
		m->m_pkthdr.rcvif = srcifp;
	}

#ifdef RSS
	mtag = m_tag_alloc(MTAG_ABI_IPV6, IPV6_TAG_DIRECT, sizeof(*ip6dc),
	    M_NOWAIT);
	if (mtag == NULL)
		goto dropfrag;

	ip6dc = (struct ip6_direct_ctx *)(mtag + 1);
	ip6dc->ip6dc_nxt = nxt;
	ip6dc->ip6dc_off = offset;

	m_tag_prepend(m, mtag);
#endif

	IP6QB_UNLOCK(bucket);
	IP6STAT_INC(ip6s_reassembled);
	in6_ifstat_inc(dstifp, ifs6_reass_ok);

#ifdef RSS
	/* Queue/dispatch for reprocessing. */
	netisr_dispatch(NETISR_IPV6_DIRECT, m);
	*mp = NULL;
	return (IPPROTO_DONE);
#endif

	/* Tell launch routine the next header. */
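	/*
	 * Returning nxt with *offp updated (instead of IPPROTO_DONE) lets
	 * the caller's extension header loop continue parsing the
	 * reassembled packet in place.
	 */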
	*mp = m;
	*offp = offset;

	return (nxt);

dropfrag:
	IP6QB_UNLOCK(bucket);
dropfrag2:
	in6_ifstat_inc(dstifp, ifs6_reass_fail);
	IP6STAT_INC(ip6s_fragdropped);
	m_freem(m);
	*mp = NULL;
	return (IPPROTO_DONE);
}

/*
 * IPv6 reassembly timer processing;
 * if a timer expires on a reassembly queue, discard it.
 */
static struct callout frag6_callout;
static void
frag6_slowtimo(void *arg __unused)
{
	VNET_ITERATOR_DECL(vnet_iter);
	struct ip6qhead *head;
	struct ip6q *q6, *q6tmp;
	uint32_t bucket;

	if (atomic_load_int(&frag6_nfrags) == 0)
		goto done;

	VNET_LIST_RLOCK_NOSLEEP();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		for (bucket = 0; bucket < IP6REASS_NHASH; bucket++) {
			if (V_ip6qb[bucket].count == 0)
				continue;
			IP6QB_LOCK(bucket);
			head = IP6QB_HEAD(bucket);
			TAILQ_FOREACH_SAFE(q6, head, ip6q_tq, q6tmp)
				if (--q6->ip6q_ttl == 0) {
					IP6STAT_ADD(ip6s_fragtimeout,
					    q6->ip6q_nfrag);
					/* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
					frag6_freef(q6, bucket);
				}
			/*
			 * If we are over the maximum number of fragments
			 * (due to the limit being lowered), drain off
			 * enough to get down to the new limit.
			 * Note that we drain all reassembly queues if
			 * maxfragpackets is 0 (fragmentation is disabled),
			 * and do not enforce a limit when maxfragpackets
			 * is negative.
			 */
			while ((V_ip6_maxfragpackets == 0 ||
			    (V_ip6_maxfragpackets > 0 &&
			    V_ip6qb[bucket].count > V_ip6_maxfragbucketsize)) &&
			    (q6 = TAILQ_LAST(head, ip6qhead)) != NULL) {
				IP6STAT_ADD(ip6s_fragoverflow, q6->ip6q_nfrag);
				/* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
				frag6_freef(q6, bucket);
			}
			IP6QB_UNLOCK(bucket);
		}
		/*
		 * If we are still over the maximum number of fragmented
		 * packets, drain off enough to get down to the new limit.
		 */
		bucket = 0;
		while (V_ip6_maxfragpackets >= 0 &&
		    atomic_load_int(&V_frag6_nfragpackets) >
		    (u_int)V_ip6_maxfragpackets) {
			IP6QB_LOCK(bucket);
			q6 = TAILQ_LAST(IP6QB_HEAD(bucket), ip6qhead);
			if (q6 != NULL) {
				IP6STAT_ADD(ip6s_fragoverflow, q6->ip6q_nfrag);
				/* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
				frag6_freef(q6, bucket);
			}
			IP6QB_UNLOCK(bucket);
			bucket = (bucket + 1) % IP6REASS_NHASH;
		}
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK_NOSLEEP();
done:
	callout_reset_sbt(&frag6_callout, SBT_1MS * IP6_CALLOUT_INTERVAL_MS,
	    SBT_1MS * 10, frag6_slowtimo, NULL, 0);
}

static void
frag6_slowtimo_init(void *arg __unused)
{

	callout_init(&frag6_callout, 1);
	callout_reset_sbt(&frag6_callout, SBT_1MS * IP6_CALLOUT_INTERVAL_MS,
	    SBT_1MS * 10, frag6_slowtimo, NULL, 0);
}
SYSINIT(frag6, SI_SUB_VNET_DONE, SI_ORDER_ANY, frag6_slowtimo_init, NULL);

/*
 * Eventhandler to adjust limits in case nmbclusters changes.
 */
static void
frag6_change(void *tag)
{
	VNET_ITERATOR_DECL(vnet_iter);

	ip6_maxfrags = IP6_MAXFRAGS;
	VNET_LIST_RLOCK_NOSLEEP();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		V_ip6_maxfragpackets = IP6_MAXFRAGPACKETS;
		frag6_set_bucketsize();
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK_NOSLEEP();
}
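/*
 * Worked example with illustrative numbers: if nmbclusters grows to
 * 2097152, ip6_maxfrags becomes 65536, each VNET's maxfragpackets
 * becomes imin(65536, 1024 * 50) == 51200, and frag6_set_bucketsize()
 * yields 51200 / 512 == 100 entries per bucket, the intended ceiling.
 */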
/*
 * Initialise reassembly queue and fragment identifier.
 */
void
frag6_init(void)
{
	uint32_t bucket;

	V_ip6_maxfragpackets = IP6_MAXFRAGPACKETS;
	frag6_set_bucketsize();
	for (bucket = 0; bucket < IP6REASS_NHASH; bucket++) {
		TAILQ_INIT(IP6QB_HEAD(bucket));
		mtx_init(&V_ip6qb[bucket].lock, "ip6qb", NULL, MTX_DEF);
		V_ip6qb[bucket].count = 0;
	}
	V_ip6qb_hashseed = arc4random();
	V_ip6_maxfragsperpacket = 64;
#ifdef VIMAGE
	V_frag6_on = true;
#endif
	if (!IS_DEFAULT_VNET(curvnet))
		return;

	ip6_maxfrags = IP6_MAXFRAGS;
	EVENTHANDLER_REGISTER(nmbclusters_change,
	    frag6_change, NULL, EVENTHANDLER_PRI_ANY);
}

/*
 * Drain off all datagram fragments.
 */
static void
frag6_drain_one(void)
{
	struct ip6q *q6;
	uint32_t bucket;

	for (bucket = 0; bucket < IP6REASS_NHASH; bucket++) {
		IP6QB_LOCK(bucket);
		while ((q6 = TAILQ_FIRST(IP6QB_HEAD(bucket))) != NULL) {
			IP6STAT_INC(ip6s_fragdropped);
			/* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
			frag6_freef(q6, bucket);
		}
		IP6QB_UNLOCK(bucket);
	}
}

void
frag6_drain(void)
{
	VNET_ITERATOR_DECL(vnet_iter);

	VNET_LIST_RLOCK_NOSLEEP();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		frag6_drain_one();
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK_NOSLEEP();
}

#ifdef VIMAGE
/*
 * Clear up IPv6 reassembly structures.
 */
void
frag6_destroy(void)
{
	uint32_t bucket;

	frag6_drain_one();
	V_frag6_on = false;
	for (bucket = 0; bucket < IP6REASS_NHASH; bucket++) {
		KASSERT(V_ip6qb[bucket].count == 0,
		    ("%s: V_ip6qb[%d] (%p) count not 0 (%d)", __func__,
		    bucket, &V_ip6qb[bucket], V_ip6qb[bucket].count));
		mtx_destroy(&V_ip6qb[bucket].lock);
	}
}
#endif