/*-
 * Copyright (c) 2015 Gleb Smirnoff <glebius@FreeBSD.org>
 * Copyright (c) 2015 Adrian Chadd <adrian@FreeBSD.org>
 * Copyright (c) 1982, 1986, 1988, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ip_input.c	8.2 (Berkeley) 1/4/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_rss.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/hash.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
#include <sys/socket.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/rss_config.h>
#include <net/netisr.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/in_rss.h>
#ifdef MAC
#include <security/mac/mac_framework.h>
#endif

SYSCTL_DECL(_net_inet_ip);

/*
 * Reassembly headers are stored in hash buckets.
 */
#define	IPREASS_NHASH_LOG2	10
#define	IPREASS_NHASH		(1 << IPREASS_NHASH_LOG2)
#define	IPREASS_HMASK		(V_ipq_hashsize - 1)

struct ipqbucket {
	TAILQ_HEAD(ipqhead, ipq) head;
	struct mtx		 lock;
	struct callout		 timer;
#ifdef VIMAGE
	struct vnet		*vnet;
#endif
	int			 count;
};

VNET_DEFINE_STATIC(struct ipqbucket *, ipq);
#define	V_ipq		VNET(ipq)
VNET_DEFINE_STATIC(uint32_t, ipq_hashseed);
#define	V_ipq_hashseed	VNET(ipq_hashseed)
VNET_DEFINE_STATIC(uint32_t, ipq_hashsize);
#define	V_ipq_hashsize	VNET(ipq_hashsize)

#define	IPQ_LOCK(i)	mtx_lock(&V_ipq[i].lock)
#define	IPQ_TRYLOCK(i)	mtx_trylock(&V_ipq[i].lock)
#define	IPQ_UNLOCK(i)	mtx_unlock(&V_ipq[i].lock)
#define	IPQ_LOCK_ASSERT(i)	mtx_assert(&V_ipq[i].lock, MA_OWNED)
#define	IPQ_BUCKET_LOCK_ASSERT(b)	mtx_assert(&(b)->lock, MA_OWNED)

VNET_DEFINE_STATIC(int, ipreass_maxbucketsize);
#define	V_ipreass_maxbucketsize	VNET(ipreass_maxbucketsize)

void		ipreass_init(void);
void		ipreass_vnet_init(void);
#ifdef VIMAGE
void		ipreass_destroy(void);
#endif
static int	sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS);
static int	sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS);
static int	sysctl_fragttl(SYSCTL_HANDLER_ARGS);
static void	ipreass_zone_change(void *);
static void	ipreass_drain_tomax(void);
static void	ipq_free(struct ipqbucket *, struct ipq *);
static struct ipq * ipq_reuse(int);
static void	ipreass_callout(void *);
static void	ipreass_reschedule(struct ipqbucket *);

static inline void
ipq_timeout(struct ipqbucket *bucket, struct ipq *fp)
{

	IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags);
	ipq_free(bucket, fp);
}

static inline void
ipq_drop(struct ipqbucket *bucket, struct ipq *fp)
{

	IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
	ipq_free(bucket, fp);
	ipreass_reschedule(bucket);
}

/*
 * By default, limit the number of IP fragments across all reassembly
 * queues to 1/32 of the total number of mbuf clusters.
 *
 * Limit the total number of reassembly queues per VNET to the
 * IP fragment limit, but ensure the limit will not allow any bucket
 * to grow above 100 items.  (The bucket limit is
 * IP_MAXFRAGPACKETS / (V_ipq_hashsize / 2), so the 50 is the correct
 * multiplier to reach a 100-item limit.)
 * The 100-item limit was chosen as brief testing seems to show that
 * this produces "reasonable" performance on some subset of systems
 * under DoS attack.
 */
#define	IP_MAXFRAGS		(nmbclusters / 32)
#define	IP_MAXFRAGPACKETS	(imin(IP_MAXFRAGS, V_ipq_hashsize * 50))
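
/*
 * Illustrative arithmetic only (assuming the default hash size and that
 * uma_zone_set_max() grants the requested value unchanged): with
 * IPREASS_NHASH (1 << 10 == 1024) buckets and enough mbuf clusters that
 * IP_MAXFRAGS is not the limiting term, IP_MAXFRAGPACKETS is
 * 1024 * 50 == 51200 queue entries, and ipreass_vnet_init() computes a
 * per-bucket limit of 51200 / (1024 / 2) == 100 entries.
 */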

static int		maxfrags;
static u_int __exclusive_cache_line	nfrags;
SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfrags, CTLFLAG_RW,
    &maxfrags, 0,
    "Maximum number of IPv4 fragments allowed across all reassembly queues");
SYSCTL_UINT(_net_inet_ip, OID_AUTO, curfrags, CTLFLAG_RD,
    &nfrags, 0,
    "Current number of IPv4 fragments across all reassembly queues");

VNET_DEFINE_STATIC(uma_zone_t, ipq_zone);
#define	V_ipq_zone	VNET(ipq_zone)

SYSCTL_UINT(_net_inet_ip, OID_AUTO, reass_hashsize,
    CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(ipq_hashsize), 0,
    "Size of IP fragment reassembly hashtable");

SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    NULL, 0, sysctl_maxfragpackets, "I",
    "Maximum number of IPv4 fragment reassembly queue entries");
SYSCTL_UMA_CUR(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_VNET,
    &VNET_NAME(ipq_zone),
    "Current number of IPv4 fragment reassembly queue entries");

VNET_DEFINE_STATIC(int, noreass);
#define	V_noreass	VNET(noreass)

VNET_DEFINE_STATIC(int, maxfragsperpacket);
#define	V_maxfragsperpacket	VNET(maxfragsperpacket)
SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(maxfragsperpacket), 0,
    "Maximum number of IPv4 fragments allowed per packet");
SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragbucketsize,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0,
    sysctl_maxfragbucketsize, "I",
    "Maximum number of IPv4 fragment reassembly queue entries per bucket");

VNET_DEFINE_STATIC(u_int, ipfragttl) = 30;
#define	V_ipfragttl	VNET(ipfragttl)
SYSCTL_PROC(_net_inet_ip, OID_AUTO, fragttl, CTLTYPE_INT | CTLFLAG_RW |
    CTLFLAG_MPSAFE | CTLFLAG_VNET, NULL, 0, sysctl_fragttl, "IU",
    "IP fragment life time on reassembly queue (seconds)");

/*
 * Take an incoming datagram fragment and try to reassemble it into a
 * whole datagram.  If the argument is the first fragment or one
 * in between, the function will return NULL and store the mbuf
 * in the fragment chain.  If the argument is the last fragment,
 * the packet will be reassembled and the pointer to the new
 * mbuf returned for further processing.  Only m_tags attached
 * to the first packet/fragment are preserved.
 * The IP header is *NOT* adjusted out of iplen.
 */
#define	M_IP_FRAG	M_PROTO9
struct mbuf *
ip_reass(struct mbuf *m)
{
	struct ip *ip;
	struct mbuf *p, *q, *nq, *t;
	struct ipq *fp;
	struct ifnet *srcifp;
	struct ipqhead *head;
	int i, hlen, next, tmpmax;
	u_int8_t ecn, ecn0;
	uint32_t hash, hashkey[3];
#ifdef RSS
	uint32_t rss_hash, rss_type;
#endif

	/*
	 * If reassembly is disabled or maxfragsperpacket is 0,
	 * never accept fragments.
	 * Also, drop the packet if it would exceed the maximum
	 * number of fragments.
	 */
	tmpmax = maxfrags;
	if (V_noreass == 1 || V_maxfragsperpacket == 0 ||
	    (tmpmax >= 0 && atomic_load_int(&nfrags) >= (u_int)tmpmax)) {
		IPSTAT_INC(ips_fragments);
		IPSTAT_INC(ips_fragdropped);
		m_freem(m);
		return (NULL);
	}

	ip = mtod(m, struct ip *);
	hlen = ip->ip_hl << 2;

	/*
	 * Adjust ip_len to not reflect header,
	 * convert offset of this to bytes.
	 */
	ip->ip_len = htons(ntohs(ip->ip_len) - hlen);
	/*
	 * Make sure that fragments have a data length
	 * that's a non-zero multiple of 8 bytes, unless
	 * this is the last fragment.
	 */
	if (ip->ip_len == htons(0) ||
	    ((ip->ip_off & htons(IP_MF)) && (ntohs(ip->ip_len) & 0x7) != 0)) {
		IPSTAT_INC(ips_toosmall); /* XXX */
		IPSTAT_INC(ips_fragdropped);
		m_freem(m);
		return (NULL);
	}
	if (ip->ip_off & htons(IP_MF))
		m->m_flags |= M_IP_FRAG;
	else
		m->m_flags &= ~M_IP_FRAG;
	ip->ip_off = htons(ntohs(ip->ip_off) << 3);

	/*
	 * Make sure the fragment lies within a packet of valid size.
	 */
	if (ntohs(ip->ip_len) + ntohs(ip->ip_off) > IP_MAXPACKET) {
		IPSTAT_INC(ips_toolong);
		IPSTAT_INC(ips_fragdropped);
		m_freem(m);
		return (NULL);
	}

	/*
	 * Store receive network interface pointer for later.
	 */
	srcifp = m->m_pkthdr.rcvif;

	/*
	 * Attempt reassembly; if it succeeds, proceed.
	 * ip_reass() will return a different mbuf.
	 */
	IPSTAT_INC(ips_fragments);
	m->m_pkthdr.PH_loc.ptr = ip;

	/*
	 * Presence of header sizes in mbufs
	 * would confuse code below.
	 */
	m->m_data += hlen;
	m->m_len -= hlen;

	hashkey[0] = ip->ip_src.s_addr;
	hashkey[1] = ip->ip_dst.s_addr;
	hashkey[2] = (uint32_t)ip->ip_p << 16;
	hashkey[2] += ip->ip_id;
	hash = jenkins_hash32(hashkey, nitems(hashkey), V_ipq_hashseed);
	hash &= IPREASS_HMASK;
	head = &V_ipq[hash].head;
	IPQ_LOCK(hash);

	/*
	 * Look for queue of fragments
	 * of this datagram.
	 */
	TAILQ_FOREACH(fp, head, ipq_list)
		if (ip->ip_id == fp->ipq_id &&
		    ip->ip_src.s_addr == fp->ipq_src.s_addr &&
		    ip->ip_dst.s_addr == fp->ipq_dst.s_addr &&
#ifdef MAC
		    mac_ipq_match(m, fp) &&
#endif
		    ip->ip_p == fp->ipq_p)
			break;
	/*
	 * If first fragment to arrive, create a reassembly queue.
	 */
	if (fp == NULL) {
		if (V_ipq[hash].count < V_ipreass_maxbucketsize)
			fp = uma_zalloc(V_ipq_zone, M_NOWAIT);
		if (fp == NULL)
			fp = ipq_reuse(hash);
		if (fp == NULL)
			goto dropfrag;
#ifdef MAC
		if (mac_ipq_init(fp, M_NOWAIT) != 0) {
			uma_zfree(V_ipq_zone, fp);
			fp = NULL;
			goto dropfrag;
		}
		mac_ipq_create(m, fp);
#endif
		TAILQ_INSERT_HEAD(head, fp, ipq_list);
		V_ipq[hash].count++;
		fp->ipq_nfrags = 1;
		atomic_add_int(&nfrags, 1);
		fp->ipq_expire = time_uptime + V_ipfragttl;
		fp->ipq_p = ip->ip_p;
		fp->ipq_id = ip->ip_id;
		fp->ipq_src = ip->ip_src;
		fp->ipq_dst = ip->ip_dst;
		fp->ipq_frags = m;
		if (m->m_flags & M_IP_FRAG)
			fp->ipq_maxoff = -1;
		else
			fp->ipq_maxoff = ntohs(ip->ip_off) + ntohs(ip->ip_len);
		m->m_nextpkt = NULL;
		if (fp == TAILQ_LAST(head, ipqhead))
			callout_reset_sbt(&V_ipq[hash].timer,
			    SBT_1S * V_ipfragttl, SBT_1S, ipreass_callout,
			    &V_ipq[hash], 0);
		else
			MPASS(callout_active(&V_ipq[hash].timer));
		goto done;
	} else {
		/*
		 * If we already saw the last fragment, make sure
		 * this fragment's offset looks sane.  Otherwise, if
		 * this is the last fragment, record its endpoint.
		 */
		if (fp->ipq_maxoff > 0) {
			i = ntohs(ip->ip_off) + ntohs(ip->ip_len);
			if (((m->m_flags & M_IP_FRAG) && i >= fp->ipq_maxoff) ||
			    ((m->m_flags & M_IP_FRAG) == 0 &&
			    i != fp->ipq_maxoff)) {
				fp = NULL;
				goto dropfrag;
			}
		} else if ((m->m_flags & M_IP_FRAG) == 0)
			fp->ipq_maxoff = ntohs(ip->ip_off) + ntohs(ip->ip_len);
		fp->ipq_nfrags++;
		atomic_add_int(&nfrags, 1);
#ifdef MAC
		mac_ipq_update(m, fp);
#endif
	}

#define GETIP(m)	((struct ip*)((m)->m_pkthdr.PH_loc.ptr))

	/*
	 * Handle ECN by comparing this segment with the first one;
	 * if CE is set, do not lose CE.
	 * Drop if CE and not-ECT are mixed for the same packet.
	 */
	ecn = ip->ip_tos & IPTOS_ECN_MASK;
	ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK;
	if (ecn == IPTOS_ECN_CE) {
		if (ecn0 == IPTOS_ECN_NOTECT)
			goto dropfrag;
		if (ecn0 != IPTOS_ECN_CE)
			GETIP(fp->ipq_frags)->ip_tos |= IPTOS_ECN_CE;
	}
	if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT)
		goto dropfrag;

	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt)
		if (ntohs(GETIP(q)->ip_off) > ntohs(ip->ip_off))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us, otherwise
	 * stick the new segment in the proper place.
	 *
	 * If some of the data is dropped from the preceding
	 * segment, then its checksum is invalidated.
	 */
	if (p) {
		i = ntohs(GETIP(p)->ip_off) + ntohs(GETIP(p)->ip_len) -
		    ntohs(ip->ip_off);
		if (i > 0) {
			if (i >= ntohs(ip->ip_len))
				goto dropfrag;
			m_adj(m, i);
			m->m_pkthdr.csum_flags = 0;
			ip->ip_off = htons(ntohs(ip->ip_off) + i);
			ip->ip_len = htons(ntohs(ip->ip_len) - i);
		}
		m->m_nextpkt = p->m_nextpkt;
		p->m_nextpkt = m;
	} else {
		m->m_nextpkt = fp->ipq_frags;
		fp->ipq_frags = m;
	}

	/*
	 * While we overlap succeeding segments, trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL && ntohs(ip->ip_off) + ntohs(ip->ip_len) >
	    ntohs(GETIP(q)->ip_off); q = nq) {
		i = (ntohs(ip->ip_off) + ntohs(ip->ip_len)) -
		    ntohs(GETIP(q)->ip_off);
		if (i < ntohs(GETIP(q)->ip_len)) {
			GETIP(q)->ip_len = htons(ntohs(GETIP(q)->ip_len) - i);
			GETIP(q)->ip_off = htons(ntohs(GETIP(q)->ip_off) + i);
			m_adj(q, i);
			q->m_pkthdr.csum_flags = 0;
			break;
		}
		nq = q->m_nextpkt;
		m->m_nextpkt = nq;
		IPSTAT_INC(ips_fragdropped);
		fp->ipq_nfrags--;
		atomic_subtract_int(&nfrags, 1);
		m_freem(q);
	}

	/*
	 * Check for complete reassembly and perform frag per packet
	 * limiting.
	 *
	 * Frag limiting is performed here so that the nth frag has
	 * a chance to complete the packet before we drop the packet.
	 * As a result, n+1 frags are actually allowed per packet, but
	 * only n will ever be stored. (n = maxfragsperpacket.)
	 */
	next = 0;
	for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
		if (ntohs(GETIP(q)->ip_off) != next) {
			if (fp->ipq_nfrags > V_maxfragsperpacket)
				ipq_drop(&V_ipq[hash], fp);
			goto done;
		}
		next += ntohs(GETIP(q)->ip_len);
	}
	/* Make sure the last packet didn't have the IP_MF flag. */
	if (p->m_flags & M_IP_FRAG) {
		if (fp->ipq_nfrags > V_maxfragsperpacket)
			ipq_drop(&V_ipq[hash], fp);
		goto done;
	}

	/*
	 * Reassembly is complete.  Make sure the packet is a sane size.
	 */
	q = fp->ipq_frags;
	ip = GETIP(q);
	if (next + (ip->ip_hl << 2) > IP_MAXPACKET) {
		IPSTAT_INC(ips_toolong);
		ipq_drop(&V_ipq[hash], fp);
		goto done;
	}

	/*
	 * Concatenate fragments.
	 */
	m = q;
	t = m->m_next;
	m->m_next = NULL;
	m_cat(m, t);
	nq = q->m_nextpkt;
	q->m_nextpkt = NULL;
	for (q = nq; q != NULL; q = nq) {
		nq = q->m_nextpkt;
		q->m_nextpkt = NULL;
		m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags;
		m->m_pkthdr.csum_data += q->m_pkthdr.csum_data;
		m_demote_pkthdr(q);
		m_cat(m, q);
	}
	/*
	 * In order to do checksumming faster we do 'end-around carry' here
	 * (and not in the for{} loop), though it implies we are not going to
	 * reassemble more than 64k fragments.
	 */
	while (m->m_pkthdr.csum_data & 0xffff0000)
		m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) +
		    (m->m_pkthdr.csum_data >> 16);
	atomic_subtract_int(&nfrags, fp->ipq_nfrags);
#ifdef MAC
	mac_ipq_reassemble(fp, m);
	mac_ipq_destroy(fp);
#endif

	/*
	 * Create header for new ip packet by modifying header of first
	 * packet; dequeue and discard fragment reassembly header.
	 * Make header visible.
	 */
	ip->ip_len = htons((ip->ip_hl << 2) + next);
	ip->ip_src = fp->ipq_src;
	ip->ip_dst = fp->ipq_dst;
	TAILQ_REMOVE(head, fp, ipq_list);
	V_ipq[hash].count--;
	uma_zfree(V_ipq_zone, fp);
	m->m_len += (ip->ip_hl << 2);
	m->m_data -= (ip->ip_hl << 2);
	/* some debugging cruft by sklower, below, will go away soon */
	if (m->m_flags & M_PKTHDR) {	/* XXX this should be done elsewhere */
		m_fixhdr(m);
		/* set valid receive interface pointer */
		m->m_pkthdr.rcvif = srcifp;
	}
	IPSTAT_INC(ips_reassembled);
	ipreass_reschedule(&V_ipq[hash]);
	IPQ_UNLOCK(hash);

#ifdef RSS
	/*
	 * Query the RSS layer for the flowid / flowtype for the
	 * mbuf payload.
	 *
	 * For now, just assume we have to calculate a new one.
	 * Later on we should check to see if the assigned flowid matches
	 * what RSS wants for the given IP protocol and if so, just keep it.
	 *
	 * We then queue into the relevant netisr so it can be dispatched
	 * to the correct CPU.
	 *
	 * Note - this may return 1, which means the flowid in the mbuf
	 * is correct for the configured RSS hash types and can be used.
	 */
	if (rss_mbuf_software_hash_v4(m, 0, &rss_hash, &rss_type) == 0) {
		m->m_pkthdr.flowid = rss_hash;
		M_HASHTYPE_SET(m, rss_type);
	}

	/*
	 * Queue/dispatch for reprocessing.
	 *
	 * Note: this is much slower than just handling the frame in the
	 * current receive context.  It's likely worth investigating
	 * why this is.
	 */
	netisr_dispatch(NETISR_IP_DIRECT, m);
	return (NULL);
#endif

	/* Handle in-line */
	return (m);

dropfrag:
	IPSTAT_INC(ips_fragdropped);
	if (fp != NULL) {
		fp->ipq_nfrags--;
		atomic_subtract_int(&nfrags, 1);
	}
	m_freem(m);
done:
	IPQ_UNLOCK(hash);
	return (NULL);

#undef GETIP
}

/*
 * Timer expired on a bucket.
 * There should be at least one ipq to be timed out.
 */
static void
ipreass_callout(void *arg)
{
	struct ipqbucket *bucket = arg;
	struct ipq *fp;

	IPQ_BUCKET_LOCK_ASSERT(bucket);
	MPASS(atomic_load_int(&nfrags) > 0);

	CURVNET_SET(bucket->vnet);
	fp = TAILQ_LAST(&bucket->head, ipqhead);
	KASSERT(fp != NULL && fp->ipq_expire <= time_uptime,
	    ("%s: stray callout on bucket %p, %ju < %ju", __func__, bucket,
	    fp ? (uintmax_t)fp->ipq_expire : 0, (uintmax_t)time_uptime));

	while (fp != NULL && fp->ipq_expire <= time_uptime) {
		ipq_timeout(bucket, fp);
		fp = TAILQ_LAST(&bucket->head, ipqhead);
	}
	ipreass_reschedule(bucket);
	CURVNET_RESTORE();
}

static void
ipreass_reschedule(struct ipqbucket *bucket)
{
	struct ipq *fp;

	IPQ_BUCKET_LOCK_ASSERT(bucket);

	if ((fp = TAILQ_LAST(&bucket->head, ipqhead)) != NULL) {
		time_t t;

		/* Protect against time_uptime tick. */
		t = fp->ipq_expire - time_uptime;
		t = (t > 0) ? t : 1;
		callout_reset_sbt(&bucket->timer, SBT_1S * t, SBT_1S,
		    ipreass_callout, bucket, 0);
	} else
		callout_stop(&bucket->timer);
}

static void
ipreass_drain_vnet(void)
{
	u_int dropped = 0;

	for (int i = 0; i < V_ipq_hashsize; i++) {
		bool resched;

		IPQ_LOCK(i);
		resched = !TAILQ_EMPTY(&V_ipq[i].head);
		while (!TAILQ_EMPTY(&V_ipq[i].head)) {
			struct ipq *fp = TAILQ_FIRST(&V_ipq[i].head);

			dropped += fp->ipq_nfrags;
			ipq_free(&V_ipq[i], fp);
		}
		if (resched)
			ipreass_reschedule(&V_ipq[i]);
		KASSERT(V_ipq[i].count == 0,
		    ("%s: V_ipq[%d] count %d (V_ipq=%p)", __func__, i,
		    V_ipq[i].count, V_ipq));
		IPQ_UNLOCK(i);
	}
	IPSTAT_ADD(ips_fragdropped, dropped);
}

/*
 * Drain off all datagram fragments.
 */
static void
ipreass_drain(void)
{
	VNET_ITERATOR_DECL(vnet_iter);

	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		ipreass_drain_vnet();
		CURVNET_RESTORE();
	}
}

/*
 * Initialize IP reassembly structures.
 */
MALLOC_DEFINE(M_IPREASS_HASH, "IP reass", "IP packet reassembly hash headers");
void
ipreass_vnet_init(void)
{
	int max;

	V_ipq_hashsize = IPREASS_NHASH;
	TUNABLE_INT_FETCH("net.inet.ip.reass_hashsize", &V_ipq_hashsize);
	V_ipq = malloc(sizeof(struct ipqbucket) * V_ipq_hashsize,
	    M_IPREASS_HASH, M_WAITOK);

	for (int i = 0; i < V_ipq_hashsize; i++) {
		TAILQ_INIT(&V_ipq[i].head);
		mtx_init(&V_ipq[i].lock, "IP reassembly", NULL,
		    MTX_DEF | MTX_DUPOK | MTX_NEW);
		callout_init_mtx(&V_ipq[i].timer, &V_ipq[i].lock, 0);
		V_ipq[i].count = 0;
#ifdef VIMAGE
		V_ipq[i].vnet = curvnet;
#endif
	}
	V_ipq_hashseed = arc4random();
	V_maxfragsperpacket = 16;
	V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL,
	    NULL, UMA_ALIGN_PTR, 0);
	max = IP_MAXFRAGPACKETS;
	max = uma_zone_set_max(V_ipq_zone, max);
	V_ipreass_maxbucketsize = imax(max / (V_ipq_hashsize / 2), 1);
}

void
ipreass_init(void)
{

	maxfrags = IP_MAXFRAGS;
	EVENTHANDLER_REGISTER(nmbclusters_change, ipreass_zone_change,
	    NULL, EVENTHANDLER_PRI_ANY);
	EVENTHANDLER_REGISTER(vm_lowmem, ipreass_drain, NULL,
	    LOWMEM_PRI_DEFAULT);
	EVENTHANDLER_REGISTER(mbuf_lowmem, ipreass_drain, NULL,
	    LOWMEM_PRI_DEFAULT);
}

/*
 * Drain off all datagram fragments belonging to
 * the given network interface.
 */
static void
ipreass_cleanup(void *arg __unused, struct ifnet *ifp)
{
	struct ipq *fp, *temp;
	struct mbuf *m;
	int i;

	KASSERT(ifp != NULL, ("%s: ifp is NULL", __func__));

	CURVNET_SET_QUIET(ifp->if_vnet);

	/*
	 * Skip processing if IPv4 reassembly is not initialised or
	 * has been torn down by ipreass_destroy().
	 */
	if (V_ipq_zone == NULL) {
		CURVNET_RESTORE();
		return;
	}

	for (i = 0; i < V_ipq_hashsize; i++) {
		IPQ_LOCK(i);
		/* Scan fragment list. */
		TAILQ_FOREACH_SAFE(fp, &V_ipq[i].head, ipq_list, temp) {
			for (m = fp->ipq_frags; m != NULL; m = m->m_nextpkt) {
				/* clear no longer valid rcvif pointer */
				if (m->m_pkthdr.rcvif == ifp)
					m->m_pkthdr.rcvif = NULL;
			}
		}
		IPQ_UNLOCK(i);
	}
	CURVNET_RESTORE();
}
EVENTHANDLER_DEFINE(ifnet_departure_event, ipreass_cleanup, NULL, 0);

#ifdef VIMAGE
/*
 * Destroy IP reassembly structures.
 */
void
ipreass_destroy(void)
{

	ipreass_drain_vnet();
	uma_zdestroy(V_ipq_zone);
	V_ipq_zone = NULL;
	for (int i = 0; i < V_ipq_hashsize; i++)
		mtx_destroy(&V_ipq[i].lock);
	free(V_ipq, M_IPREASS_HASH);
}
#endif

/*
 * After maxnipq has been updated, propagate the change to UMA.  The UMA zone
 * max has slightly different semantics than the sysctl, for historical
 * reasons.
 */
static void
ipreass_drain_tomax(void)
{
	struct ipq *fp;
	int target;

	/*
	 * Make sure each bucket is under the new limit.  If
	 * necessary, drop enough of the oldest elements from
	 * each bucket to get under the new limit.
	 */
	for (int i = 0; i < V_ipq_hashsize; i++) {
		IPQ_LOCK(i);
		while (V_ipq[i].count > V_ipreass_maxbucketsize &&
		    (fp = TAILQ_LAST(&V_ipq[i].head, ipqhead)) != NULL)
			ipq_timeout(&V_ipq[i], fp);
		ipreass_reschedule(&V_ipq[i]);
		IPQ_UNLOCK(i);
	}

	/*
	 * If we are over the maximum number of fragments,
	 * drain off enough to get down to the new limit,
	 * stripping off last elements on queues.
	 * Every run we strip the oldest element from each bucket.
	 */
	target = uma_zone_get_max(V_ipq_zone);
	while (uma_zone_get_cur(V_ipq_zone) > target) {
		for (int i = 0; i < V_ipq_hashsize; i++) {
			IPQ_LOCK(i);
			fp = TAILQ_LAST(&V_ipq[i].head, ipqhead);
			if (fp != NULL) {
				ipq_timeout(&V_ipq[i], fp);
				ipreass_reschedule(&V_ipq[i]);
			}
			IPQ_UNLOCK(i);
		}
	}
}

static void
ipreass_zone_change(void *tag)
{
	VNET_ITERATOR_DECL(vnet_iter);
	int max;

	maxfrags = IP_MAXFRAGS;
	max = IP_MAXFRAGPACKETS;
	VNET_LIST_RLOCK_NOSLEEP();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		max = uma_zone_set_max(V_ipq_zone, max);
		V_ipreass_maxbucketsize = imax(max / (V_ipq_hashsize / 2), 1);
		ipreass_drain_tomax();
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK_NOSLEEP();
}

/*
 * Change the limit on the UMA zone, or disable fragment allocation
 * altogether.  Since 0 and -1 are special values here, we need our own
 * handler instead of sysctl_handle_uma_zone_max().
 */
static int
sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS)
{
	int error, max;

	if (V_noreass == 0) {
		max = uma_zone_get_max(V_ipq_zone);
		if (max == 0)
			max = -1;
	} else
		max = 0;
	error = sysctl_handle_int(oidp, &max, 0, req);
	if (error || !req->newptr)
		return (error);
	if (max > 0) {
		/*
		 * XXXRW: Might be a good idea to sanity check the argument
		 * and place an extreme upper bound.
		 */
		max = uma_zone_set_max(V_ipq_zone, max);
		V_ipreass_maxbucketsize = imax(max / (V_ipq_hashsize / 2), 1);
		ipreass_drain_tomax();
		V_noreass = 0;
	} else if (max == 0) {
		V_noreass = 1;
		ipreass_drain();
	} else if (max == -1) {
		V_noreass = 0;
		uma_zone_set_max(V_ipq_zone, 0);
		V_ipreass_maxbucketsize = INT_MAX;
	} else
		return (EINVAL);
	return (0);
}

/*
 * Look for an old fragment queue header that can be reused.  Try to
 * reuse a header from the currently locked hash bucket first.
 */
static struct ipq *
ipq_reuse(int start)
{
	struct ipq *fp;
	int bucket, i;

	IPQ_LOCK_ASSERT(start);

	for (i = 0; i < V_ipq_hashsize; i++) {
		bucket = (start + i) % V_ipq_hashsize;
		if (bucket != start && IPQ_TRYLOCK(bucket) == 0)
			continue;
		fp = TAILQ_LAST(&V_ipq[bucket].head, ipqhead);
		if (fp) {
			struct mbuf *m;

			IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags);
			atomic_subtract_int(&nfrags, fp->ipq_nfrags);
			while (fp->ipq_frags) {
				m = fp->ipq_frags;
				fp->ipq_frags = m->m_nextpkt;
				m_freem(m);
			}
			TAILQ_REMOVE(&V_ipq[bucket].head, fp, ipq_list);
			V_ipq[bucket].count--;
			ipreass_reschedule(&V_ipq[bucket]);
			if (bucket != start)
				IPQ_UNLOCK(bucket);
			break;
		}
		if (bucket != start)
			IPQ_UNLOCK(bucket);
	}
	IPQ_LOCK_ASSERT(start);
	return (fp);
}

/*
 * Free a fragment reassembly header and all associated datagrams.
 */
static void
ipq_free(struct ipqbucket *bucket, struct ipq *fp)
{
	struct mbuf *q;

	atomic_subtract_int(&nfrags, fp->ipq_nfrags);
	while (fp->ipq_frags) {
		q = fp->ipq_frags;
		fp->ipq_frags = q->m_nextpkt;
		m_freem(q);
	}
	TAILQ_REMOVE(&bucket->head, fp, ipq_list);
	bucket->count--;
	uma_zfree(V_ipq_zone, fp);
}

/*
 * Get or set the maximum number of reassembly queues per bucket.
 */
static int
sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS)
{
	int error, max;

	max = V_ipreass_maxbucketsize;
	error = sysctl_handle_int(oidp, &max, 0, req);
	if (error || !req->newptr)
		return (error);
	if (max <= 0)
		return (EINVAL);
	V_ipreass_maxbucketsize = max;
	ipreass_drain_tomax();
	return (0);
}

/*
 * Get or set the IP fragment time to live.
 */
static int
sysctl_fragttl(SYSCTL_HANDLER_ARGS)
{
	u_int ttl;
	int error;

	ttl = V_ipfragttl;
	error = sysctl_handle_int(oidp, &ttl, 0, req);
	if (error || !req->newptr)
		return (error);

	if (ttl < 1 || ttl > MAXTTL)
		return (EINVAL);

	atomic_store_int(&V_ipfragttl, ttl);
	return (0);
}
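
/*
 * Administrator-facing knobs declared by the SYSCTL macros above; the
 * values shown are illustrative only:
 *
 *	sysctl net.inet.ip.fragttl=15		# fragment lifetime, 1..MAXTTL
 *	sysctl net.inet.ip.maxfragsperpacket=32	# fragments allowed per packet
 *	sysctl net.inet.ip.maxfragpackets=0	# 0 disables reassembly
 *	sysctl net.inet.ip.maxfragpackets=-1	# -1 removes the zone limit
 *
 * net.inet.ip.reass_hashsize is CTLFLAG_RDTUN and is fetched via
 * TUNABLE_INT_FETCH() in ipreass_vnet_init(), so it can only be changed
 * as a loader tunable, e.g. net.inet.ip.reass_hashsize=2048 in
 * loader.conf.
 */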