1 /*- 2 * Copyright (c) 2015 Gleb Smirnoff <glebius@FreeBSD.org> 3 * Copyright (c) 2015 Adrian Chadd <adrian@FreeBSD.org> 4 * Copyright (c) 1982, 1986, 1988, 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. Neither the name of the University nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 
 */

#include <sys/cdefs.h>
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/hash.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
#include <sys/socket.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/if_private.h>
#include <net/rss_config.h>
#include <net/netisr.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/in_rss.h>
#ifdef MAC
#include <security/mac/mac_framework.h>
#endif

SYSCTL_DECL(_net_inet_ip);

/*
 * Reassembly headers are stored in hash buckets.
 *
 * IPREASS_NHASH is the default number of buckets; the per-VNET value
 * actually in use is V_ipq_hashsize.  IPREASS_HMASK turns a hash value
 * into a bucket index by masking with V_ipq_hashsize - 1.
 */
#define	IPREASS_NHASH_LOG2	10
#define	IPREASS_NHASH		(1 << IPREASS_NHASH_LOG2)
#define	IPREASS_HMASK		(V_ipq_hashsize - 1)

/*
 * One hash bucket of the reassembly table.
 */
struct ipqbucket {
	TAILQ_HEAD(ipqhead, ipq) head;	/* reassembly queues; new ones are
					   inserted at the head */
	struct mtx		 lock;	/* protects this bucket; also the
					   lock associated with 'timer' */
	struct callout		 timer;	/* runs ipreass_callout() on this
					   bucket */
#ifdef VIMAGE
	struct vnet		 *vnet;	/* owning VNET, made current by the
					   callout handler */
#endif
	int			 count;	/* number of queues linked on 'head' */
};

/* Per-VNET bucket array, hash seed and number of buckets. */
VNET_DEFINE_STATIC(struct ipqbucket *, ipq);
#define	V_ipq		VNET(ipq)
VNET_DEFINE_STATIC(uint32_t, ipq_hashseed);
#define	V_ipq_hashseed	VNET(ipq_hashseed)
VNET_DEFINE_STATIC(uint32_t, ipq_hashsize);
#define	V_ipq_hashsize	VNET(ipq_hashsize)

/*
 * Bucket locking.  The first four macros take a bucket index into V_ipq,
 * IPQ_BUCKET_LOCK_ASSERT() takes a bucket pointer.
 */
#define	IPQ_LOCK(i)	mtx_lock(&V_ipq[i].lock)
#define	IPQ_TRYLOCK(i)	mtx_trylock(&V_ipq[i].lock)
#define	IPQ_UNLOCK(i)	mtx_unlock(&V_ipq[i].lock)
#define	IPQ_LOCK_ASSERT(i)	mtx_assert(&V_ipq[i].lock, MA_OWNED)
#define	IPQ_BUCKET_LOCK_ASSERT(b)	mtx_assert(&(b)->lock, MA_OWNED)

/* Per-VNET limit on the number of reassembly queues in a single bucket. */
VNET_DEFINE_STATIC(int, ipreass_maxbucketsize);
#define	V_ipreass_maxbucketsize	VNET(ipreass_maxbucketsize)

void		ipreass_init(void);
void		ipreass_vnet_init(void);
#ifdef VIMAGE
void		ipreass_destroy(void);
#endif
static int	sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS);
static int sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS); 105 static int sysctl_fragttl(SYSCTL_HANDLER_ARGS); 106 static void ipreass_zone_change(void *); 107 static void ipreass_drain_tomax(void); 108 static void ipq_free(struct ipqbucket *, struct ipq *); 109 static struct ipq * ipq_reuse(int); 110 static void ipreass_callout(void *); 111 static void ipreass_reschedule(struct ipqbucket *); 112 113 static inline void 114 ipq_timeout(struct ipqbucket *bucket, struct ipq *fp) 115 { 116 117 IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags); 118 ipq_free(bucket, fp); 119 } 120 121 static inline void 122 ipq_drop(struct ipqbucket *bucket, struct ipq *fp) 123 { 124 125 IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags); 126 ipq_free(bucket, fp); 127 ipreass_reschedule(bucket); 128 } 129 130 /* 131 * By default, limit the number of IP fragments across all reassembly 132 * queues to 1/32 of the total number of mbuf clusters. 133 * 134 * Limit the total number of reassembly queues per VNET to the 135 * IP fragment limit, but ensure the limit will not allow any bucket 136 * to grow above 100 items. (The bucket limit is 137 * IP_MAXFRAGPACKETS / (V_ipq_hashsize / 2), so the 50 is the correct 138 * multiplier to reach a 100-item limit.) 139 * The 100-item limit was chosen as brief testing seems to show that 140 * this produces "reasonable" performance on some subset of systems 141 * under DoS attack. 
 */
#define	IP_MAXFRAGS		(nmbclusters / 32)
#define	IP_MAXFRAGPACKETS	(imin(IP_MAXFRAGS, V_ipq_hashsize * 50))

/*
 * Global (not per-VNET) fragment accounting: 'maxfrags' is the limit and
 * 'nfrags' the current number of fragments held on all reassembly queues.
 * 'nfrags' is updated with atomic operations throughout this file.
 */
static int		maxfrags;
static u_int __exclusive_cache_line	nfrags;
SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfrags, CTLFLAG_RW,
    &maxfrags, 0,
    "Maximum number of IPv4 fragments allowed across all reassembly queues");
SYSCTL_UINT(_net_inet_ip, OID_AUTO, curfrags, CTLFLAG_RD,
    &nfrags, 0,
    "Current number of IPv4 fragments across all reassembly queues");

/* Per-VNET UMA zone that reassembly queue headers (struct ipq) come from. */
VNET_DEFINE_STATIC(uma_zone_t, ipq_zone);
#define	V_ipq_zone	VNET(ipq_zone)

SYSCTL_UINT(_net_inet_ip, OID_AUTO, reass_hashsize,
    CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(ipq_hashsize), 0,
    "Size of IP fragment reassembly hashtable");

SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    NULL, 0, sysctl_maxfragpackets, "I",
    "Maximum number of IPv4 fragment reassembly queue entries");
SYSCTL_UMA_CUR(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_VNET,
    &VNET_NAME(ipq_zone),
    "Current number of IPv4 fragment reassembly queue entries");

/* Set to 1 by sysctl_maxfragpackets() to refuse all fragments. */
VNET_DEFINE_STATIC(int, noreass);
#define	V_noreass	VNET(noreass)

VNET_DEFINE_STATIC(int, maxfragsperpacket);
#define	V_maxfragsperpacket	VNET(maxfragsperpacket)
SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(maxfragsperpacket), 0,
    "Maximum number of IPv4 fragments allowed per packet");
SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragbucketsize,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0,
    sysctl_maxfragbucketsize, "I",
    "Maximum number of IPv4 fragment reassembly queue entries per bucket");

/* Lifetime of a reassembly queue in seconds; see sysctl_fragttl(). */
VNET_DEFINE_STATIC(u_int, ipfragttl) = 30;
#define	V_ipfragttl	VNET(ipfragttl)
SYSCTL_PROC(_net_inet_ip, OID_AUTO, fragttl, CTLTYPE_INT | CTLFLAG_RW |
    CTLFLAG_MPSAFE | CTLFLAG_VNET, NULL, 0, sysctl_fragttl, "IU",
    "IP fragment life time on reassembly queue (seconds)");

/*
 * Take incoming datagram fragment and try to reassemble it into
 * whole datagram.  If the argument is the first fragment or one
 * in between the function will return NULL and store the mbuf
 * in the fragment chain.  If the argument is the last fragment
 * the packet will be reassembled and the pointer to the new
 * mbuf returned for further processing.  Only m_tags attached
 * to the first packet/fragment are preserved.
 * The IP header is *NOT* adjusted out of iplen.
 *
 * The mbuf is consumed on every path that returns NULL: it is either
 * freed or linked onto a reassembly queue.  When the kernel is built
 * with RSS a completed datagram is handed to netisr_dispatch() and NULL
 * is returned in that case as well.
 *
 * While a fragment sits on a queue its M_IP_FRAG mbuf flag mirrors the
 * IP_MF bit, m_pkthdr.PH_loc.ptr points at its IP header, and ip_off and
 * ip_len in that header hold the byte offset and payload length in
 * network byte order.
 */
#define	M_IP_FRAG	M_PROTO9
struct mbuf *
ip_reass(struct mbuf *m)
{
	struct ip *ip;
	struct mbuf *p, *q, *nq, *t;
	struct ipq *fp;
	struct ifnet *srcifp;
	struct ipqhead *head;
	int i, hlen, next, tmpmax;
	u_int8_t ecn, ecn0;
	uint32_t hash, hashkey[3];
#ifdef	RSS
	uint32_t rss_hash, rss_type;
#endif

	/*
	 * If no reassembling or maxfragsperpacket are 0,
	 * never accept fragments.
	 * Also, drop packet if it would exceed the maximum
	 * number of fragments.
	 * A negative maxfrags disables the global fragment limit.
	 */
	tmpmax = maxfrags;
	if (V_noreass == 1 || V_maxfragsperpacket == 0 ||
	    (tmpmax >= 0 && atomic_load_int(&nfrags) >= (u_int)tmpmax)) {
		IPSTAT_INC(ips_fragments);
		IPSTAT_INC(ips_fragdropped);
		m_freem(m);
		return (NULL);
	}

	ip = mtod(m, struct ip *);
	hlen = ip->ip_hl << 2;

	/*
	 * Adjust ip_len to not reflect header,
	 * convert offset of this to bytes.
	 */
	ip->ip_len = htons(ntohs(ip->ip_len) - hlen);
	/*
	 * Make sure that fragments have a data length
	 * that's a non-zero multiple of 8 bytes, unless
	 * this is the last fragment.
	 */
	if (ip->ip_len == htons(0) ||
	    ((ip->ip_off & htons(IP_MF)) && (ntohs(ip->ip_len) & 0x7) != 0)) {
		IPSTAT_INC(ips_toosmall); /* XXX */
		IPSTAT_INC(ips_fragdropped);
		m_freem(m);
		return (NULL);
	}
	/* Remember "more fragments" in the mbuf before ip_off is rewritten. */
	if (ip->ip_off & htons(IP_MF))
		m->m_flags |= M_IP_FRAG;
	else
		m->m_flags &= ~M_IP_FRAG;
	/* The shift leaves only the byte offset in the 16-bit field. */
	ip->ip_off = htons(ntohs(ip->ip_off) << 3);

	/*
	 * Make sure the fragment lies within a packet of valid size.
	 */
	if (ntohs(ip->ip_len) + ntohs(ip->ip_off) > IP_MAXPACKET) {
		IPSTAT_INC(ips_toolong);
		IPSTAT_INC(ips_fragdropped);
		m_freem(m);
		return (NULL);
	}

	/*
	 * Store receive network interface pointer for later.
	 */
	srcifp = m->m_pkthdr.rcvif;

	/*
	 * Attempt reassembly; if it succeeds, proceed.
	 * ip_reass() will return a different mbuf.
	 */
	IPSTAT_INC(ips_fragments);
	m->m_pkthdr.PH_loc.ptr = ip;

	/*
	 * Presence of header sizes in mbufs
	 * would confuse code below.
	 */
	m->m_data += hlen;
	m->m_len -= hlen;

	/*
	 * Pick the bucket from a seeded hash of (src, dst, protocol, id)
	 * and hold its lock for the rest of the reassembly work.
	 */
	hashkey[0] = ip->ip_src.s_addr;
	hashkey[1] = ip->ip_dst.s_addr;
	hashkey[2] = (uint32_t)ip->ip_p << 16;
	hashkey[2] += ip->ip_id;
	hash = jenkins_hash32(hashkey, nitems(hashkey), V_ipq_hashseed);
	hash &= IPREASS_HMASK;
	head = &V_ipq[hash].head;
	IPQ_LOCK(hash);

	/*
	 * Look for queue of fragments
	 * of this datagram.
	 */
	TAILQ_FOREACH(fp, head, ipq_list)
		if (ip->ip_id == fp->ipq_id &&
		    ip->ip_src.s_addr == fp->ipq_src.s_addr &&
		    ip->ip_dst.s_addr == fp->ipq_dst.s_addr &&
#ifdef MAC
		    mac_ipq_match(m, fp) &&
#endif
		    ip->ip_p == fp->ipq_p)
			break;
	/*
	 * If first fragment to arrive, create a reassembly queue.
	 */
	if (fp == NULL) {
		/*
		 * Allocate a fresh header only while the bucket is under
		 * its limit; otherwise, or if allocation fails, try to
		 * recycle an existing queue.
		 */
		if (V_ipq[hash].count < V_ipreass_maxbucketsize)
			fp = uma_zalloc(V_ipq_zone, M_NOWAIT);
		if (fp == NULL)
			fp = ipq_reuse(hash);
		if (fp == NULL)
			goto dropfrag;
#ifdef MAC
		if (mac_ipq_init(fp, M_NOWAIT) != 0) {
			uma_zfree(V_ipq_zone, fp);
			fp = NULL;
			goto dropfrag;
		}
		mac_ipq_create(m, fp);
#endif
		TAILQ_INSERT_HEAD(head, fp, ipq_list);
		V_ipq[hash].count++;
		fp->ipq_nfrags = 1;
		atomic_add_int(&nfrags, 1);
		fp->ipq_expire = time_uptime + V_ipfragttl;
		fp->ipq_p = ip->ip_p;
		fp->ipq_id = ip->ip_id;
		fp->ipq_src = ip->ip_src;
		fp->ipq_dst = ip->ip_dst;
		fp->ipq_frags = m;
		/* ipq_maxoff stays -1 until the final fragment is seen. */
		if (m->m_flags & M_IP_FRAG)
			fp->ipq_maxoff = -1;
		else
			fp->ipq_maxoff = ntohs(ip->ip_off) + ntohs(ip->ip_len);
		m->m_nextpkt = NULL;
		/*
		 * The new queue is also the tail only if the bucket was
		 * empty, in which case the bucket timer has to be armed.
		 */
		if (fp == TAILQ_LAST(head, ipqhead))
			callout_reset_sbt(&V_ipq[hash].timer,
			    SBT_1S * V_ipfragttl, SBT_1S, ipreass_callout,
			    &V_ipq[hash], 0);
		else
			MPASS(callout_active(&V_ipq[hash].timer));
		goto done;
	} else {
		/*
		 * If we already saw the last fragment, make sure
		 * this fragment's offset looks sane. Otherwise, if
		 * this is the last fragment, record its endpoint.
		 */
		if (fp->ipq_maxoff > 0) {
			i = ntohs(ip->ip_off) + ntohs(ip->ip_len);
			if (((m->m_flags & M_IP_FRAG) && i >= fp->ipq_maxoff) ||
			    ((m->m_flags & M_IP_FRAG) == 0 &&
			    i != fp->ipq_maxoff)) {
				/* fp = NULL: not counted, nothing to undo. */
				fp = NULL;
				goto dropfrag;
			}
		} else if ((m->m_flags & M_IP_FRAG) == 0)
			fp->ipq_maxoff = ntohs(ip->ip_off) + ntohs(ip->ip_len);
		/* Counted now; dropfrag undoes this when fp is non-NULL. */
		fp->ipq_nfrags++;
		atomic_add_int(&nfrags, 1);
#ifdef MAC
		mac_ipq_update(m, fp);
#endif
	}

/* IP header of a queued fragment, as saved in PH_loc.ptr above. */
#define GETIP(m)	((struct ip*)((m)->m_pkthdr.PH_loc.ptr))

	/*
	 * Handle ECN by comparing this segment with the first one;
	 * if CE is set, do not lose CE.
	 * drop if CE and not-ECT are mixed for the same packet.
	 */
	ecn = ip->ip_tos & IPTOS_ECN_MASK;
	ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK;
	if (ecn == IPTOS_ECN_CE) {
		if (ecn0 == IPTOS_ECN_NOTECT)
			goto dropfrag;
		if (ecn0 != IPTOS_ECN_CE)
			GETIP(fp->ipq_frags)->ip_tos |= IPTOS_ECN_CE;
	}
	if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT)
		goto dropfrag;

	/*
	 * Find a segment which begins after this one does.
	 * On exit p is the segment before the insertion point (or NULL)
	 * and q the one after it (or NULL).
	 */
	for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt)
		if (ntohs(GETIP(q)->ip_off) > ntohs(ip->ip_off))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us, otherwise
	 * stick new segment in the proper place.
	 *
	 * If some of the data is dropped from the preceding
	 * segment, then its checksum is invalidated.
	 */
	if (p) {
		i = ntohs(GETIP(p)->ip_off) + ntohs(GETIP(p)->ip_len) -
		    ntohs(ip->ip_off);
		if (i > 0) {
			if (i >= ntohs(ip->ip_len))
				goto dropfrag;
			m_adj(m, i);
			m->m_pkthdr.csum_flags = 0;
			ip->ip_off = htons(ntohs(ip->ip_off) + i);
			ip->ip_len = htons(ntohs(ip->ip_len) - i);
		}
		m->m_nextpkt = p->m_nextpkt;
		p->m_nextpkt = m;
	} else {
		m->m_nextpkt = fp->ipq_frags;
		fp->ipq_frags = m;
	}

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL && ntohs(ip->ip_off) + ntohs(ip->ip_len) >
	    ntohs(GETIP(q)->ip_off); q = nq) {
		i = (ntohs(ip->ip_off) + ntohs(ip->ip_len)) -
		    ntohs(GETIP(q)->ip_off);
		if (i < ntohs(GETIP(q)->ip_len)) {
			/* Partial overlap: trim the front of q and stop. */
			GETIP(q)->ip_len = htons(ntohs(GETIP(q)->ip_len) - i);
			GETIP(q)->ip_off = htons(ntohs(GETIP(q)->ip_off) + i);
			m_adj(q, i);
			q->m_pkthdr.csum_flags = 0;
			break;
		}
		/* q is fully covered: unlink it and release it. */
		nq = q->m_nextpkt;
		m->m_nextpkt = nq;
		IPSTAT_INC(ips_fragdropped);
		fp->ipq_nfrags--;
		atomic_subtract_int(&nfrags, 1);
		m_freem(q);
	}

	/*
	 * Check for complete reassembly and perform frag per packet
	 * limiting.
	 *
	 * Frag limiting is performed here so that the nth frag has
	 * a chance to complete the packet before we drop the packet.
	 * As a result, n+1 frags are actually allowed per packet, but
	 * only n will ever be stored. (n = maxfragsperpacket.)
	 *
	 */
	next = 0;
	for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
		if (ntohs(GETIP(q)->ip_off) != next) {
			/* Hole in the data: not complete yet. */
			if (fp->ipq_nfrags > V_maxfragsperpacket)
				ipq_drop(&V_ipq[hash], fp);
			goto done;
		}
		next += ntohs(GETIP(q)->ip_len);
	}
	/* Make sure the last packet didn't have the IP_MF flag */
	if (p->m_flags & M_IP_FRAG) {
		if (fp->ipq_nfrags > V_maxfragsperpacket)
			ipq_drop(&V_ipq[hash], fp);
		goto done;
	}

	/*
	 * Reassembly is complete.  Make sure the packet is a sane size.
	 */
	q = fp->ipq_frags;
	ip = GETIP(q);
	if (next + (ip->ip_hl << 2) > IP_MAXPACKET) {
		IPSTAT_INC(ips_toolong);
		ipq_drop(&V_ipq[hash], fp);
		goto done;
	}

	/*
	 * Concatenate fragments.
	 * Checksum flags are intersected and checksum data summed across
	 * all fragments; the sum is folded after the loop.
	 */
	m = q;
	t = m->m_next;
	m->m_next = NULL;
	m_cat(m, t);
	nq = q->m_nextpkt;
	q->m_nextpkt = NULL;
	for (q = nq; q != NULL; q = nq) {
		nq = q->m_nextpkt;
		q->m_nextpkt = NULL;
		m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags;
		m->m_pkthdr.csum_data += q->m_pkthdr.csum_data;
		m_demote_pkthdr(q);
		m_cat(m, q);
	}
	/*
	 * In order to do checksumming faster we do 'end-around carry' here
	 * (and not in for{} loop), though it implies we are not going to
	 * reassemble more than 64k fragments.
	 */
	while (m->m_pkthdr.csum_data & 0xffff0000)
		m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) +
		    (m->m_pkthdr.csum_data >> 16);
	atomic_subtract_int(&nfrags, fp->ipq_nfrags);
#ifdef MAC
	mac_ipq_reassemble(fp, m);
	mac_ipq_destroy(fp);
#endif

	/*
	 * Create header for new ip packet by modifying header of first
	 * packet;  dequeue and discard fragment reassembly header.
	 * Make header visible.
	 */
	ip->ip_len = htons((ip->ip_hl << 2) + next);
	ip->ip_src = fp->ipq_src;
	ip->ip_dst = fp->ipq_dst;
	TAILQ_REMOVE(head, fp, ipq_list);
	V_ipq[hash].count--;
	uma_zfree(V_ipq_zone, fp);
	m->m_len += (ip->ip_hl << 2);
	m->m_data -= (ip->ip_hl << 2);
	/* some debugging cruft by sklower, below, will go away soon */
	if (m->m_flags & M_PKTHDR) {	/* XXX this should be done elsewhere */
		m_fixhdr(m);
		/* set valid receive interface pointer */
		m->m_pkthdr.rcvif = srcifp;
	}
	IPSTAT_INC(ips_reassembled);
	ipreass_reschedule(&V_ipq[hash]);
	IPQ_UNLOCK(hash);

#ifdef	RSS
	/*
	 * Query the RSS layer for the flowid / flowtype for the
	 * mbuf payload.
	 *
	 * For now, just assume we have to calculate a new one.
	 * Later on we should check to see if the assigned flowid matches
	 * what RSS wants for the given IP protocol and if so, just keep it.
	 *
	 * We then queue into the relevant netisr so it can be dispatched
	 * to the correct CPU.
	 *
	 * Note - this may return 1, which means the flowid in the mbuf
	 * is correct for the configured RSS hash types and can be used.
	 */
	if (rss_mbuf_software_hash_v4(m, 0, &rss_hash, &rss_type) == 0) {
		m->m_pkthdr.flowid = rss_hash;
		M_HASHTYPE_SET(m, rss_type);
	}

	/*
	 * Queue/dispatch for reprocessing.
	 *
	 * Note: this is much slower than just handling the frame in the
	 * current receive context.  It's likely worth investigating
	 * why this is.
	 */
	netisr_dispatch(NETISR_IP_DIRECT, m);
	return (NULL);
#endif

	/* Handle in-line */
	return (m);

dropfrag:
	/*
	 * Free the incoming fragment.  A non-NULL fp means the fragment was
	 * already counted against that queue, so undo the accounting.
	 */
	IPSTAT_INC(ips_fragdropped);
	if (fp != NULL) {
		fp->ipq_nfrags--;
		atomic_subtract_int(&nfrags, 1);
	}
	m_freem(m);
done:
	IPQ_UNLOCK(hash);
	return (NULL);

#undef GETIP
}

/*
 * Timer expired on a bucket.
 * There should be at least one ipq to be timed out.
 *
 * Runs with the bucket lock held (asserted below).  Expired queues are
 * taken from the tail of the bucket list, then the timer is re-armed for
 * whatever remains.
 */
static void
ipreass_callout(void *arg)
{
	struct ipqbucket *bucket = arg;
	struct ipq *fp;

	IPQ_BUCKET_LOCK_ASSERT(bucket);
	MPASS(atomic_load_int(&nfrags) > 0);

	CURVNET_SET(bucket->vnet);
	fp = TAILQ_LAST(&bucket->head, ipqhead);
	KASSERT(fp != NULL && fp->ipq_expire <= time_uptime,
	    ("%s: stray callout on bucket %p, %ju < %ju", __func__, bucket,
	    fp ? (uintmax_t)fp->ipq_expire : 0, (uintmax_t)time_uptime));

	while (fp != NULL && fp->ipq_expire <= time_uptime) {
		ipq_timeout(bucket, fp);
		fp = TAILQ_LAST(&bucket->head, ipqhead);
	}
	ipreass_reschedule(bucket);
	CURVNET_RESTORE();
}

/*
 * Re-arm the bucket timer for the queue at the tail of the bucket list,
 * or stop the timer if the bucket is empty.  The bucket lock must be held.
 */
static void
ipreass_reschedule(struct ipqbucket *bucket)
{
	struct ipq *fp;

	IPQ_BUCKET_LOCK_ASSERT(bucket);

	if ((fp = TAILQ_LAST(&bucket->head, ipqhead)) != NULL) {
		time_t t;

		/* Protect against time_uptime tick. */
		t = fp->ipq_expire - time_uptime;
		t = (t > 0) ? t : 1;
		callout_reset_sbt(&bucket->timer, SBT_1S * t, SBT_1S,
		    ipreass_callout, bucket, 0);
	} else
		callout_stop(&bucket->timer);
}

/*
 * Free every reassembly queue of the current VNET and account the
 * discarded fragments in ips_fragdropped.
 */
static void
ipreass_drain_vnet(void)
{
	u_int dropped = 0;

	for (int i = 0; i < V_ipq_hashsize; i++) {
		bool resched;

		IPQ_LOCK(i);
		/* Only touch the timer if the bucket held something. */
		resched = !TAILQ_EMPTY(&V_ipq[i].head);
		while(!TAILQ_EMPTY(&V_ipq[i].head)) {
			struct ipq *fp = TAILQ_FIRST(&V_ipq[i].head);

			dropped += fp->ipq_nfrags;
			ipq_free(&V_ipq[i], fp);
		}
		if (resched)
			ipreass_reschedule(&V_ipq[i]);
		KASSERT(V_ipq[i].count == 0,
		    ("%s: V_ipq[%d] count %d (V_ipq=%p)", __func__, i,
		    V_ipq[i].count, V_ipq));
		IPQ_UNLOCK(i);
	}
	IPSTAT_ADD(ips_fragdropped, dropped);
}

/*
 * Drain off all datagram fragments.
 *
 * NOTE(review): the VNET list is walked here without taking the VNET list
 * lock, unlike ipreass_zone_change(); confirm that every caller context
 * makes this safe.
 */
static void
ipreass_drain(void)
{
	VNET_ITERATOR_DECL(vnet_iter);

	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		ipreass_drain_vnet();
		CURVNET_RESTORE();
	}
}


/*
 * Initialize IP reassembly structures.
674 */ 675 MALLOC_DEFINE(M_IPREASS_HASH, "IP reass", "IP packet reassembly hash headers"); 676 void 677 ipreass_vnet_init(void) 678 { 679 int max; 680 681 V_ipq_hashsize = IPREASS_NHASH; 682 TUNABLE_INT_FETCH("net.inet.ip.reass_hashsize", &V_ipq_hashsize); 683 V_ipq = malloc(sizeof(struct ipqbucket) * V_ipq_hashsize, 684 M_IPREASS_HASH, M_WAITOK); 685 686 for (int i = 0; i < V_ipq_hashsize; i++) { 687 TAILQ_INIT(&V_ipq[i].head); 688 mtx_init(&V_ipq[i].lock, "IP reassembly", NULL, 689 MTX_DEF | MTX_DUPOK | MTX_NEW); 690 callout_init_mtx(&V_ipq[i].timer, &V_ipq[i].lock, 0); 691 V_ipq[i].count = 0; 692 #ifdef VIMAGE 693 V_ipq[i].vnet = curvnet; 694 #endif 695 } 696 V_ipq_hashseed = arc4random(); 697 V_maxfragsperpacket = 16; 698 V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL, 699 NULL, UMA_ALIGN_PTR, 0); 700 max = IP_MAXFRAGPACKETS; 701 max = uma_zone_set_max(V_ipq_zone, max); 702 V_ipreass_maxbucketsize = imax(max / (V_ipq_hashsize / 2), 1); 703 } 704 705 void 706 ipreass_init(void) 707 { 708 709 maxfrags = IP_MAXFRAGS; 710 EVENTHANDLER_REGISTER(nmbclusters_change, ipreass_zone_change, 711 NULL, EVENTHANDLER_PRI_ANY); 712 EVENTHANDLER_REGISTER(vm_lowmem, ipreass_drain, NULL, 713 LOWMEM_PRI_DEFAULT); 714 EVENTHANDLER_REGISTER(mbuf_lowmem, ipreass_drain, NULL, 715 LOWMEM_PRI_DEFAULT); 716 } 717 718 /* 719 * Drain off all datagram fragments belonging to 720 * the given network interface. 721 */ 722 static void 723 ipreass_cleanup(void *arg __unused, struct ifnet *ifp) 724 { 725 struct ipq *fp, *temp; 726 struct mbuf *m; 727 int i; 728 729 KASSERT(ifp != NULL, ("%s: ifp is NULL", __func__)); 730 731 CURVNET_SET_QUIET(ifp->if_vnet); 732 733 /* 734 * Skip processing if IPv4 reassembly is not initialised or 735 * torn down by ipreass_destroy(). 736 */ 737 if (V_ipq_zone == NULL) { 738 CURVNET_RESTORE(); 739 return; 740 } 741 742 for (i = 0; i < V_ipq_hashsize; i++) { 743 IPQ_LOCK(i); 744 /* Scan fragment list. 
*/ 745 TAILQ_FOREACH_SAFE(fp, &V_ipq[i].head, ipq_list, temp) { 746 for (m = fp->ipq_frags; m != NULL; m = m->m_nextpkt) { 747 /* clear no longer valid rcvif pointer */ 748 if (m->m_pkthdr.rcvif == ifp) 749 m->m_pkthdr.rcvif = NULL; 750 } 751 } 752 IPQ_UNLOCK(i); 753 } 754 CURVNET_RESTORE(); 755 } 756 EVENTHANDLER_DEFINE(ifnet_departure_event, ipreass_cleanup, NULL, 0); 757 758 #ifdef VIMAGE 759 /* 760 * Destroy IP reassembly structures. 761 */ 762 void 763 ipreass_destroy(void) 764 { 765 766 ipreass_drain_vnet(); 767 uma_zdestroy(V_ipq_zone); 768 V_ipq_zone = NULL; 769 for (int i = 0; i < V_ipq_hashsize; i++) 770 mtx_destroy(&V_ipq[i].lock); 771 free(V_ipq, M_IPREASS_HASH); 772 } 773 #endif 774 775 /* 776 * After maxnipq has been updated, propagate the change to UMA. The UMA zone 777 * max has slightly different semantics than the sysctl, for historical 778 * reasons. 779 */ 780 static void 781 ipreass_drain_tomax(void) 782 { 783 struct ipq *fp; 784 int target; 785 786 /* 787 * Make sure each bucket is under the new limit. If 788 * necessary, drop enough of the oldest elements from 789 * each bucket to get under the new limit. 790 */ 791 for (int i = 0; i < V_ipq_hashsize; i++) { 792 IPQ_LOCK(i); 793 while (V_ipq[i].count > V_ipreass_maxbucketsize && 794 (fp = TAILQ_LAST(&V_ipq[i].head, ipqhead)) != NULL) 795 ipq_timeout(&V_ipq[i], fp); 796 ipreass_reschedule(&V_ipq[i]); 797 IPQ_UNLOCK(i); 798 } 799 800 /* 801 * If we are over the maximum number of fragments, 802 * drain off enough to get down to the new limit, 803 * stripping off last elements on queues. Every 804 * run we strip the oldest element from each bucket. 
805 */ 806 target = uma_zone_get_max(V_ipq_zone); 807 while (uma_zone_get_cur(V_ipq_zone) > target) { 808 for (int i = 0; i < V_ipq_hashsize; i++) { 809 IPQ_LOCK(i); 810 fp = TAILQ_LAST(&V_ipq[i].head, ipqhead); 811 if (fp != NULL) { 812 ipq_timeout(&V_ipq[i], fp); 813 ipreass_reschedule(&V_ipq[i]); 814 } 815 IPQ_UNLOCK(i); 816 } 817 } 818 } 819 820 static void 821 ipreass_zone_change(void *tag) 822 { 823 VNET_ITERATOR_DECL(vnet_iter); 824 int max; 825 826 maxfrags = IP_MAXFRAGS; 827 max = IP_MAXFRAGPACKETS; 828 VNET_LIST_RLOCK_NOSLEEP(); 829 VNET_FOREACH(vnet_iter) { 830 CURVNET_SET(vnet_iter); 831 max = uma_zone_set_max(V_ipq_zone, max); 832 V_ipreass_maxbucketsize = imax(max / (V_ipq_hashsize / 2), 1); 833 ipreass_drain_tomax(); 834 CURVNET_RESTORE(); 835 } 836 VNET_LIST_RUNLOCK_NOSLEEP(); 837 } 838 839 /* 840 * Change the limit on the UMA zone, or disable the fragment allocation 841 * at all. Since 0 and -1 is a special values here, we need our own handler, 842 * instead of sysctl_handle_uma_zone_max(). 843 */ 844 static int 845 sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS) 846 { 847 int error, max; 848 849 if (V_noreass == 0) { 850 max = uma_zone_get_max(V_ipq_zone); 851 if (max == 0) 852 max = -1; 853 } else 854 max = 0; 855 error = sysctl_handle_int(oidp, &max, 0, req); 856 if (error || !req->newptr) 857 return (error); 858 if (max > 0) { 859 /* 860 * XXXRW: Might be a good idea to sanity check the argument 861 * and place an extreme upper bound. 862 */ 863 max = uma_zone_set_max(V_ipq_zone, max); 864 V_ipreass_maxbucketsize = imax(max / (V_ipq_hashsize / 2), 1); 865 ipreass_drain_tomax(); 866 V_noreass = 0; 867 } else if (max == 0) { 868 V_noreass = 1; 869 ipreass_drain(); 870 } else if (max == -1) { 871 V_noreass = 0; 872 uma_zone_set_max(V_ipq_zone, 0); 873 V_ipreass_maxbucketsize = INT_MAX; 874 } else 875 return (EINVAL); 876 return (0); 877 } 878 879 /* 880 * Seek for old fragment queue header that can be reused. 
Try to 881 * reuse a header from currently locked hash bucket. 882 */ 883 static struct ipq * 884 ipq_reuse(int start) 885 { 886 struct ipq *fp; 887 int bucket, i; 888 889 IPQ_LOCK_ASSERT(start); 890 891 for (i = 0; i < V_ipq_hashsize; i++) { 892 bucket = (start + i) % V_ipq_hashsize; 893 if (bucket != start && IPQ_TRYLOCK(bucket) == 0) 894 continue; 895 fp = TAILQ_LAST(&V_ipq[bucket].head, ipqhead); 896 if (fp) { 897 struct mbuf *m; 898 899 IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags); 900 atomic_subtract_int(&nfrags, fp->ipq_nfrags); 901 while (fp->ipq_frags) { 902 m = fp->ipq_frags; 903 fp->ipq_frags = m->m_nextpkt; 904 m_freem(m); 905 } 906 TAILQ_REMOVE(&V_ipq[bucket].head, fp, ipq_list); 907 V_ipq[bucket].count--; 908 ipreass_reschedule(&V_ipq[bucket]); 909 if (bucket != start) 910 IPQ_UNLOCK(bucket); 911 break; 912 } 913 if (bucket != start) 914 IPQ_UNLOCK(bucket); 915 } 916 IPQ_LOCK_ASSERT(start); 917 return (fp); 918 } 919 920 /* 921 * Free a fragment reassembly header and all associated datagrams. 922 */ 923 static void 924 ipq_free(struct ipqbucket *bucket, struct ipq *fp) 925 { 926 struct mbuf *q; 927 928 atomic_subtract_int(&nfrags, fp->ipq_nfrags); 929 while (fp->ipq_frags) { 930 q = fp->ipq_frags; 931 fp->ipq_frags = q->m_nextpkt; 932 m_freem(q); 933 } 934 TAILQ_REMOVE(&bucket->head, fp, ipq_list); 935 bucket->count--; 936 uma_zfree(V_ipq_zone, fp); 937 } 938 939 /* 940 * Get or set the maximum number of reassembly queues per bucket. 941 */ 942 static int 943 sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS) 944 { 945 int error, max; 946 947 max = V_ipreass_maxbucketsize; 948 error = sysctl_handle_int(oidp, &max, 0, req); 949 if (error || !req->newptr) 950 return (error); 951 if (max <= 0) 952 return (EINVAL); 953 V_ipreass_maxbucketsize = max; 954 ipreass_drain_tomax(); 955 return (0); 956 } 957 958 /* 959 * Get or set the IP fragment time to live. 
960 */ 961 static int 962 sysctl_fragttl(SYSCTL_HANDLER_ARGS) 963 { 964 u_int ttl; 965 int error; 966 967 ttl = V_ipfragttl; 968 error = sysctl_handle_int(oidp, &ttl, 0, req); 969 if (error || !req->newptr) 970 return (error); 971 972 if (ttl < 1 || ttl > MAXTTL) 973 return (EINVAL); 974 975 atomic_store_int(&V_ipfragttl, ttl); 976 return (0); 977 } 978