/*-
 * Copyright (c) 2015 Gleb Smirnoff <glebius@FreeBSD.org>
 * Copyright (c) 2015 Adrian Chadd <adrian@FreeBSD.org>
 * Copyright (c) 1982, 1986, 1988, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "opt_rss.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/hash.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
#include <sys/socket.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/if_private.h>
#include <net/rss_config.h>
#include <net/netisr.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/in_rss.h>
#ifdef MAC
#include <security/mac/mac_framework.h>
#endif

SYSCTL_DECL(_net_inet_ip);

/*
 * Reassembly headers are stored in hash buckets.
 */
#define	IPREASS_NHASH_LOG2	10
#define	IPREASS_NHASH		(1 << IPREASS_NHASH_LOG2)
#define	IPREASS_HMASK		(V_ipq_hashsize - 1)

struct ipqbucket {
	TAILQ_HEAD(ipqhead, ipq) head;
	struct mtx		 lock;
	struct callout		 timer;
#ifdef VIMAGE
	struct vnet		*vnet;
#endif
	int			 count;
};

VNET_DEFINE_STATIC(struct ipqbucket *, ipq);
#define	V_ipq		VNET(ipq)
VNET_DEFINE_STATIC(uint32_t, ipq_hashseed);
#define	V_ipq_hashseed	VNET(ipq_hashseed)
VNET_DEFINE_STATIC(uint32_t, ipq_hashsize);
#define	V_ipq_hashsize	VNET(ipq_hashsize)

#define	IPQ_LOCK(i)	mtx_lock(&V_ipq[i].lock)
#define	IPQ_TRYLOCK(i)	mtx_trylock(&V_ipq[i].lock)
#define	IPQ_UNLOCK(i)	mtx_unlock(&V_ipq[i].lock)
#define	IPQ_LOCK_ASSERT(i)	mtx_assert(&V_ipq[i].lock, MA_OWNED)
#define	IPQ_BUCKET_LOCK_ASSERT(b)	mtx_assert(&(b)->lock, MA_OWNED)

VNET_DEFINE_STATIC(int, ipreass_maxbucketsize);
#define	V_ipreass_maxbucketsize	VNET(ipreass_maxbucketsize)

void		ipreass_init(void);
void		ipreass_vnet_init(void);
#ifdef VIMAGE
void		ipreass_destroy(void);
#endif
static int	sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS);
static int	sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS);
static int	sysctl_fragttl(SYSCTL_HANDLER_ARGS);
static void	ipreass_zone_change(void *);
static void	ipreass_drain_tomax(void);
static void	ipq_free(struct ipqbucket *, struct ipq *);
static struct ipq *ipq_reuse(int);
static void	ipreass_callout(void *);
static void	ipreass_reschedule(struct ipqbucket *);

static inline void
ipq_timeout(struct ipqbucket *bucket, struct ipq *fp)
{

	IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags);
	ipq_free(bucket, fp);
}

static inline void
ipq_drop(struct ipqbucket *bucket, struct ipq *fp)
{

	IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
	ipq_free(bucket, fp);
	ipreass_reschedule(bucket);
}

/*
 * By default, limit the number of IP fragments across all reassembly
 * queues to 1/32 of the total number of mbuf clusters.
 *
 * Limit the total number of reassembly queues per VNET to the
 * IP fragment limit, but ensure the limit will not allow any bucket
 * to grow above 100 items.  (The bucket limit is
 * IP_MAXFRAGPACKETS / (V_ipq_hashsize / 2), so the 50 is the correct
 * multiplier to reach a 100-item limit.)
 * The 100-item limit was chosen as brief testing seems to show that
 * this produces "reasonable" performance on some subset of systems
 * under DoS attack.
 */
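
/*
 * Worked example of the arithmetic above, assuming the default hash
 * size of IPREASS_NHASH (1024) buckets and enough mbuf clusters that
 * the V_ipq_hashsize * 50 term is the smaller of the two:
 *
 *   IP_MAXFRAGPACKETS       = 1024 * 50          = 51200
 *   V_ipreass_maxbucketsize = 51200 / (1024 / 2) = 100
 */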
#define	IP_MAXFRAGS		(nmbclusters / 32)
#define	IP_MAXFRAGPACKETS	(imin(IP_MAXFRAGS, V_ipq_hashsize * 50))

static int		maxfrags;
static u_int __exclusive_cache_line	nfrags;
SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfrags, CTLFLAG_RW,
    &maxfrags, 0,
    "Maximum number of IPv4 fragments allowed across all reassembly queues");
SYSCTL_UINT(_net_inet_ip, OID_AUTO, curfrags, CTLFLAG_RD,
    &nfrags, 0,
    "Current number of IPv4 fragments across all reassembly queues");

VNET_DEFINE_STATIC(uma_zone_t, ipq_zone);
#define	V_ipq_zone	VNET(ipq_zone)

SYSCTL_UINT(_net_inet_ip, OID_AUTO, reass_hashsize,
    CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(ipq_hashsize), 0,
    "Size of IP fragment reassembly hashtable");

SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    NULL, 0, sysctl_maxfragpackets, "I",
    "Maximum number of IPv4 fragment reassembly queue entries");
SYSCTL_UMA_CUR(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_VNET,
    &VNET_NAME(ipq_zone),
    "Current number of IPv4 fragment reassembly queue entries");

VNET_DEFINE_STATIC(int, noreass);
#define	V_noreass	VNET(noreass)

VNET_DEFINE_STATIC(int, maxfragsperpacket);
#define	V_maxfragsperpacket	VNET(maxfragsperpacket)
SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(maxfragsperpacket), 0,
    "Maximum number of IPv4 fragments allowed per packet");
SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragbucketsize,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0,
    sysctl_maxfragbucketsize, "I",
    "Maximum number of IPv4 fragment reassembly queue entries per bucket");

VNET_DEFINE_STATIC(u_int, ipfragttl) = 30;
#define	V_ipfragttl	VNET(ipfragttl)
SYSCTL_PROC(_net_inet_ip, OID_AUTO, fragttl, CTLTYPE_INT | CTLFLAG_RW |
    CTLFLAG_MPSAFE | CTLFLAG_VNET, NULL, 0, sysctl_fragttl, "IU",
    "IP fragment life time on reassembly queue (seconds)");

/*
 * Take an incoming datagram fragment and try to reassemble it into a
 * whole datagram.  If the argument is the first fragment or one
 * in between, the function will return NULL and store the mbuf
 * in the fragment chain.  If the argument is the last fragment,
 * the packet will be reassembled and the pointer to the new
 * mbuf returned for further processing.  Only m_tags attached
 * to the first packet/fragment are preserved.
 * The IP header is *NOT* adjusted out of iplen.
 */
#define	M_IP_FRAG	M_PROTO9
struct mbuf *
ip_reass(struct mbuf *m)
{
	struct ip *ip;
	struct mbuf *p, *q, *nq, *t;
	struct ipq *fp;
	struct ifnet *srcifp;
	struct ipqhead *head;
	int i, hlen, next, tmpmax;
	u_int8_t ecn, ecn0;
	uint32_t hash, hashkey[3];
#ifdef RSS
	uint32_t rss_hash, rss_type;
#endif

	/*
	 * If reassembly is disabled, or maxfragsperpacket is 0,
	 * never accept fragments.
	 * Also, drop the packet if it would exceed the maximum
	 * number of fragments.
	 */
	tmpmax = maxfrags;
	if (V_noreass == 1 || V_maxfragsperpacket == 0 ||
	    (tmpmax >= 0 && atomic_load_int(&nfrags) >= (u_int)tmpmax)) {
		IPSTAT_INC(ips_fragments);
		IPSTAT_INC(ips_fragdropped);
		m_freem(m);
		return (NULL);
	}

	ip = mtod(m, struct ip *);
	hlen = ip->ip_hl << 2;

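	/*
	 * Note: while a fragment sits on the reassembly queue, its
	 * ip_len counts payload bytes only and its ip_off holds the
	 * offset in plain bytes, with the flag bits shifted out; the
	 * full header length is added back to ip_len once the
	 * datagram is rebuilt below.
	 */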
	/*
	 * Adjust ip_len to not reflect the header and
	 * convert the offset of this fragment to bytes.
	 */
	ip->ip_len = htons(ntohs(ip->ip_len) - hlen);
	/*
	 * Make sure that fragments have a data length
	 * that's a non-zero multiple of 8 bytes, unless
	 * this is the last fragment.
	 */
	if (ip->ip_len == htons(0) ||
	    ((ip->ip_off & htons(IP_MF)) && (ntohs(ip->ip_len) & 0x7) != 0)) {
		IPSTAT_INC(ips_toosmall); /* XXX */
		IPSTAT_INC(ips_fragdropped);
		m_freem(m);
		return (NULL);
	}
	if (ip->ip_off & htons(IP_MF))
		m->m_flags |= M_IP_FRAG;
	else
		m->m_flags &= ~M_IP_FRAG;
	ip->ip_off = htons(ntohs(ip->ip_off) << 3);

	/*
	 * Make sure the fragment lies within a packet of valid size.
	 */
	if (ntohs(ip->ip_len) + ntohs(ip->ip_off) > IP_MAXPACKET) {
		IPSTAT_INC(ips_toolong);
		IPSTAT_INC(ips_fragdropped);
		m_freem(m);
		return (NULL);
	}

	/*
	 * Store the receive network interface pointer for later.
	 */
	srcifp = m->m_pkthdr.rcvif;

	/*
	 * Attempt reassembly; if it succeeds, proceed.
	 * ip_reass() will return a different mbuf.
	 */
	IPSTAT_INC(ips_fragments);
	m->m_pkthdr.PH_loc.ptr = ip;

	/*
	 * The presence of the IP header in the mbuf data would
	 * confuse the code below, so strip it off for now.
	 */
	m->m_data += hlen;
	m->m_len -= hlen;

	hashkey[0] = ip->ip_src.s_addr;
	hashkey[1] = ip->ip_dst.s_addr;
	hashkey[2] = (uint32_t)ip->ip_p << 16;
	hashkey[2] += ip->ip_id;
	hash = jenkins_hash32(hashkey, nitems(hashkey), V_ipq_hashseed);
	hash &= IPREASS_HMASK;
	head = &V_ipq[hash].head;
	IPQ_LOCK(hash);

	/*
	 * Look for a queue of fragments
	 * of this datagram.
	 */
	TAILQ_FOREACH(fp, head, ipq_list)
		if (ip->ip_id == fp->ipq_id &&
		    ip->ip_src.s_addr == fp->ipq_src.s_addr &&
		    ip->ip_dst.s_addr == fp->ipq_dst.s_addr &&
#ifdef MAC
		    mac_ipq_match(m, fp) &&
#endif
		    ip->ip_p == fp->ipq_p)
			break;
	/*
	 * If first fragment to arrive, create a reassembly queue.
	 */
	if (fp == NULL) {
		if (V_ipq[hash].count < V_ipreass_maxbucketsize)
			fp = uma_zalloc(V_ipq_zone, M_NOWAIT);
		if (fp == NULL)
			fp = ipq_reuse(hash);
		if (fp == NULL)
			goto dropfrag;
#ifdef MAC
		if (mac_ipq_init(fp, M_NOWAIT) != 0) {
			uma_zfree(V_ipq_zone, fp);
			fp = NULL;
			goto dropfrag;
		}
		mac_ipq_create(m, fp);
#endif
		TAILQ_INSERT_HEAD(head, fp, ipq_list);
		V_ipq[hash].count++;
		fp->ipq_nfrags = 1;
		atomic_add_int(&nfrags, 1);
		fp->ipq_expire = time_uptime + V_ipfragttl;
		fp->ipq_p = ip->ip_p;
		fp->ipq_id = ip->ip_id;
		fp->ipq_src = ip->ip_src;
		fp->ipq_dst = ip->ip_dst;
		fp->ipq_frags = m;
		if (m->m_flags & M_IP_FRAG)
			fp->ipq_maxoff = -1;
		else
			fp->ipq_maxoff = ntohs(ip->ip_off) + ntohs(ip->ip_len);
		m->m_nextpkt = NULL;
		if (fp == TAILQ_LAST(head, ipqhead))
			callout_reset_sbt(&V_ipq[hash].timer,
			    SBT_1S * V_ipfragttl, SBT_1S, ipreass_callout,
			    &V_ipq[hash], 0);
		else
			MPASS(callout_active(&V_ipq[hash].timer));
		goto done;
	} else {
		/*
		 * If we already saw the last fragment, make sure
		 * this fragment's offset looks sane.  Otherwise, if
		 * this is the last fragment, record its endpoint.
		 */
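		/*
		 * Concretely: once a "last" fragment has fixed the
		 * datagram's end, drop any further non-last fragment
		 * whose data would reach or pass that end, and any
		 * second "last" fragment reporting a different end.
		 */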
		if (fp->ipq_maxoff > 0) {
			i = ntohs(ip->ip_off) + ntohs(ip->ip_len);
			if (((m->m_flags & M_IP_FRAG) && i >= fp->ipq_maxoff) ||
			    ((m->m_flags & M_IP_FRAG) == 0 &&
			    i != fp->ipq_maxoff)) {
				fp = NULL;
				goto dropfrag;
			}
		} else if ((m->m_flags & M_IP_FRAG) == 0)
			fp->ipq_maxoff = ntohs(ip->ip_off) + ntohs(ip->ip_len);
		fp->ipq_nfrags++;
		atomic_add_int(&nfrags, 1);
#ifdef MAC
		mac_ipq_update(m, fp);
#endif
	}

#define	GETIP(m)	((struct ip*)((m)->m_pkthdr.PH_loc.ptr))

	/*
	 * Handle ECN by comparing this segment with the first one;
	 * if CE is set, do not lose CE.
	 * Drop the packet if CE and not-ECT are mixed within it.
	 */
	ecn = ip->ip_tos & IPTOS_ECN_MASK;
	ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK;
	if (ecn == IPTOS_ECN_CE) {
		if (ecn0 == IPTOS_ECN_NOTECT)
			goto dropfrag;
		if (ecn0 != IPTOS_ECN_CE)
			GETIP(fp->ipq_frags)->ip_tos |= IPTOS_ECN_CE;
	}
	if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT)
		goto dropfrag;

	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt)
		if (ntohs(GETIP(q)->ip_off) > ntohs(ip->ip_off))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us, otherwise
	 * stick the new segment in the proper place.
	 *
	 * If some of the data is dropped from the incoming
	 * segment, then its checksum is invalidated.
	 */
	if (p) {
		i = ntohs(GETIP(p)->ip_off) + ntohs(GETIP(p)->ip_len) -
		    ntohs(ip->ip_off);
		if (i > 0) {
			if (i >= ntohs(ip->ip_len))
				goto dropfrag;
			m_adj(m, i);
			m->m_pkthdr.csum_flags = 0;
			ip->ip_off = htons(ntohs(ip->ip_off) + i);
			ip->ip_len = htons(ntohs(ip->ip_len) - i);
		}
		m->m_nextpkt = p->m_nextpkt;
		p->m_nextpkt = m;
	} else {
		m->m_nextpkt = fp->ipq_frags;
		fp->ipq_frags = m;
	}

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL && ntohs(ip->ip_off) + ntohs(ip->ip_len) >
	    ntohs(GETIP(q)->ip_off); q = nq) {
		i = (ntohs(ip->ip_off) + ntohs(ip->ip_len)) -
		    ntohs(GETIP(q)->ip_off);
		if (i < ntohs(GETIP(q)->ip_len)) {
			GETIP(q)->ip_len = htons(ntohs(GETIP(q)->ip_len) - i);
			GETIP(q)->ip_off = htons(ntohs(GETIP(q)->ip_off) + i);
			m_adj(q, i);
			q->m_pkthdr.csum_flags = 0;
			break;
		}
		nq = q->m_nextpkt;
		m->m_nextpkt = nq;
		IPSTAT_INC(ips_fragdropped);
		fp->ipq_nfrags--;
		atomic_subtract_int(&nfrags, 1);
		m_freem(q);
	}

	/*
	 * Check for complete reassembly and perform frag per packet
	 * limiting.
	 *
	 * Frag limiting is performed here so that the nth frag has
	 * a chance to complete the packet before we drop the packet.
	 * As a result, n+1 frags are actually allowed per packet, but
	 * only n will ever be stored.  (n = maxfragsperpacket.)
	 */
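	/*
	 * Any mismatch between the running "next" offset and a
	 * fragment's offset means there is still a gap, i.e. the
	 * datagram is incomplete.
	 */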
	next = 0;
	for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
		if (ntohs(GETIP(q)->ip_off) != next) {
			if (fp->ipq_nfrags > V_maxfragsperpacket)
				ipq_drop(&V_ipq[hash], fp);
			goto done;
		}
		next += ntohs(GETIP(q)->ip_len);
	}
	/* Make sure the last fragment didn't have the IP_MF flag. */
	if (p->m_flags & M_IP_FRAG) {
		if (fp->ipq_nfrags > V_maxfragsperpacket)
			ipq_drop(&V_ipq[hash], fp);
		goto done;
	}

	/*
	 * Reassembly is complete.  Make sure the packet is a sane size.
	 */
	q = fp->ipq_frags;
	ip = GETIP(q);
	if (next + (ip->ip_hl << 2) > IP_MAXPACKET) {
		IPSTAT_INC(ips_toolong);
		ipq_drop(&V_ipq[hash], fp);
		goto done;
	}

	/*
	 * Concatenate fragments.
	 */
	m = q;
	t = m->m_next;
	m->m_next = NULL;
	m_cat(m, t);
	nq = q->m_nextpkt;
	q->m_nextpkt = NULL;
	for (q = nq; q != NULL; q = nq) {
		nq = q->m_nextpkt;
		q->m_nextpkt = NULL;
		m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags;
		m->m_pkthdr.csum_data += q->m_pkthdr.csum_data;
		m_demote_pkthdr(q);
		m_cat(m, q);
	}
	/*
	 * In order to do checksumming faster we do 'end-around carry' here
	 * (and not in the for{} loop), though it implies we are not going to
	 * reassemble more than 64k fragments.
	 */
	while (m->m_pkthdr.csum_data & 0xffff0000)
		m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) +
		    (m->m_pkthdr.csum_data >> 16);
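	/*
	 * For example, an accumulated sum of 0x0002fffd folds to
	 * 0xfffd + 0x2 = 0xffff in a single pass of the loop above.
	 */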
	atomic_subtract_int(&nfrags, fp->ipq_nfrags);
#ifdef MAC
	mac_ipq_reassemble(fp, m);
	mac_ipq_destroy(fp);
#endif

	/*
	 * Create header for new ip packet by modifying header of first
	 * packet; dequeue and discard fragment reassembly header.
	 * Make header visible.
	 */
	ip->ip_len = htons((ip->ip_hl << 2) + next);
	ip->ip_src = fp->ipq_src;
	ip->ip_dst = fp->ipq_dst;
	TAILQ_REMOVE(head, fp, ipq_list);
	V_ipq[hash].count--;
	uma_zfree(V_ipq_zone, fp);
	m->m_len += (ip->ip_hl << 2);
	m->m_data -= (ip->ip_hl << 2);
	/* some debugging cruft by sklower, below, will go away soon */
	if (m->m_flags & M_PKTHDR) {	/* XXX this should be done elsewhere */
		m_fixhdr(m);
		/* set valid receive interface pointer */
		m->m_pkthdr.rcvif = srcifp;
	}
	IPSTAT_INC(ips_reassembled);
	ipreass_reschedule(&V_ipq[hash]);
	IPQ_UNLOCK(hash);

#ifdef RSS
	/*
	 * Query the RSS layer for the flowid / flowtype for the
	 * mbuf payload.
	 *
	 * For now, just assume we have to calculate a new one.
	 * Later on we should check to see if the assigned flowid matches
	 * what RSS wants for the given IP protocol and if so, just keep it.
	 *
	 * We then queue into the relevant netisr so it can be dispatched
	 * to the correct CPU.
	 *
	 * Note - this may return 1, which means the flowid in the mbuf
	 * is correct for the configured RSS hash types and can be used.
	 */
	if (rss_mbuf_software_hash_v4(m, 0, &rss_hash, &rss_type) == 0) {
		m->m_pkthdr.flowid = rss_hash;
		M_HASHTYPE_SET(m, rss_type);
	}

	/*
	 * Queue/dispatch for reprocessing.
	 *
	 * Note: this is much slower than just handling the frame in the
	 * current receive context.  It's likely worth investigating
	 * why this is.
	 */
	netisr_dispatch(NETISR_IP_DIRECT, m);
	return (NULL);
#endif

	/* Handle in-line. */
	return (m);

dropfrag:
	IPSTAT_INC(ips_fragdropped);
	if (fp != NULL) {
		fp->ipq_nfrags--;
		atomic_subtract_int(&nfrags, 1);
	}
	m_freem(m);
done:
	IPQ_UNLOCK(hash);
	return (NULL);

#undef GETIP
}

/*
 * Timer expired on a bucket.
 * There should be at least one ipq to be timed out.
 */
static void
ipreass_callout(void *arg)
{
	struct ipqbucket *bucket = arg;
	struct ipq *fp;

	IPQ_BUCKET_LOCK_ASSERT(bucket);
	MPASS(atomic_load_int(&nfrags) > 0);

	CURVNET_SET(bucket->vnet);
	fp = TAILQ_LAST(&bucket->head, ipqhead);
	KASSERT(fp != NULL && fp->ipq_expire <= time_uptime,
	    ("%s: stray callout on bucket %p, %ju < %ju", __func__, bucket,
	    fp ? (uintmax_t)fp->ipq_expire : 0, (uintmax_t)time_uptime));

	while (fp != NULL && fp->ipq_expire <= time_uptime) {
		ipq_timeout(bucket, fp);
		fp = TAILQ_LAST(&bucket->head, ipqhead);
	}
	ipreass_reschedule(bucket);
	CURVNET_RESTORE();
}

static void
ipreass_reschedule(struct ipqbucket *bucket)
{
	struct ipq *fp;

	IPQ_BUCKET_LOCK_ASSERT(bucket);

	if ((fp = TAILQ_LAST(&bucket->head, ipqhead)) != NULL) {
		time_t t;

		/* Protect against time_uptime tick. */
		t = fp->ipq_expire - time_uptime;
		t = (t > 0) ? t : 1;
		callout_reset_sbt(&bucket->timer, SBT_1S * t, SBT_1S,
		    ipreass_callout, bucket, 0);
	} else
		callout_stop(&bucket->timer);
}
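
/*
 * Note on the timer scheme above: queues get the then-current expiry
 * time at creation and are inserted at the head of their bucket, so
 * each bucket's list is ordered newest to oldest (assuming fragttl is
 * not lowered mid-flight).  One callout per bucket, armed for the
 * tail (oldest) entry, therefore suffices.
 */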

static void
ipreass_drain_vnet(void)
{
	u_int dropped = 0;

	for (int i = 0; i < V_ipq_hashsize; i++) {
		bool resched;

		IPQ_LOCK(i);
		resched = !TAILQ_EMPTY(&V_ipq[i].head);
		while (!TAILQ_EMPTY(&V_ipq[i].head)) {
			struct ipq *fp = TAILQ_FIRST(&V_ipq[i].head);

			dropped += fp->ipq_nfrags;
			ipq_free(&V_ipq[i], fp);
		}
		if (resched)
			ipreass_reschedule(&V_ipq[i]);
		KASSERT(V_ipq[i].count == 0,
		    ("%s: V_ipq[%d] count %d (V_ipq=%p)", __func__, i,
		    V_ipq[i].count, V_ipq));
		IPQ_UNLOCK(i);
	}
	IPSTAT_ADD(ips_fragdropped, dropped);
}

/*
 * Drain off all datagram fragments.
 */
static void
ipreass_drain(void)
{
	VNET_ITERATOR_DECL(vnet_iter);

	VNET_LIST_RLOCK();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		ipreass_drain_vnet();
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK();
}

static void
ipreass_drain_lowmem(void *arg __unused, int flags __unused)
{
	ipreass_drain();
}

/*
 * Initialize IP reassembly structures.
 */
MALLOC_DEFINE(M_IPREASS_HASH, "IP reass", "IP packet reassembly hash headers");
void
ipreass_vnet_init(void)
{
	int max;

	V_ipq_hashsize = IPREASS_NHASH;
	TUNABLE_INT_FETCH("net.inet.ip.reass_hashsize", &V_ipq_hashsize);
	V_ipq = malloc(sizeof(struct ipqbucket) * V_ipq_hashsize,
	    M_IPREASS_HASH, M_WAITOK);

	for (int i = 0; i < V_ipq_hashsize; i++) {
		TAILQ_INIT(&V_ipq[i].head);
		mtx_init(&V_ipq[i].lock, "IP reassembly", NULL,
		    MTX_DEF | MTX_DUPOK | MTX_NEW);
		callout_init_mtx(&V_ipq[i].timer, &V_ipq[i].lock, 0);
		V_ipq[i].count = 0;
#ifdef VIMAGE
		V_ipq[i].vnet = curvnet;
#endif
	}
	V_ipq_hashseed = arc4random();
	V_maxfragsperpacket = 16;
	V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL,
	    NULL, UMA_ALIGN_PTR, 0);
	max = IP_MAXFRAGPACKETS;
	max = uma_zone_set_max(V_ipq_zone, max);
	V_ipreass_maxbucketsize = imax(max / (V_ipq_hashsize / 2), 1);
}

void
ipreass_init(void)
{

	maxfrags = IP_MAXFRAGS;
	EVENTHANDLER_REGISTER(nmbclusters_change, ipreass_zone_change,
	    NULL, EVENTHANDLER_PRI_ANY);
	EVENTHANDLER_REGISTER(vm_lowmem, ipreass_drain_lowmem, NULL,
	    LOWMEM_PRI_DEFAULT);
	EVENTHANDLER_REGISTER(mbuf_lowmem, ipreass_drain_lowmem, NULL,
	    LOWMEM_PRI_DEFAULT);
}

/*
 * Drain off all datagram fragments belonging to
 * the given network interface.
 */
static void
ipreass_cleanup(void *arg __unused, struct ifnet *ifp)
{
	struct ipq *fp, *temp;
	struct mbuf *m;
	int i;

	KASSERT(ifp != NULL, ("%s: ifp is NULL", __func__));

	CURVNET_SET_QUIET(ifp->if_vnet);

	/*
	 * Skip processing if IPv4 reassembly is not initialized yet or
	 * already torn down by ipreass_destroy().
	 */
	if (V_ipq_zone == NULL) {
		CURVNET_RESTORE();
		return;
	}

	for (i = 0; i < V_ipq_hashsize; i++) {
		IPQ_LOCK(i);
		/* Scan fragment list. */
		TAILQ_FOREACH_SAFE(fp, &V_ipq[i].head, ipq_list, temp) {
			for (m = fp->ipq_frags; m != NULL; m = m->m_nextpkt) {
				/* clear no longer valid rcvif pointer */
				if (m->m_pkthdr.rcvif == ifp)
					m->m_pkthdr.rcvif = NULL;
			}
		}
		IPQ_UNLOCK(i);
	}
	CURVNET_RESTORE();
}
EVENTHANDLER_DEFINE(ifnet_departure_event, ipreass_cleanup, NULL, 0);

#ifdef VIMAGE
/*
 * Destroy IP reassembly structures.
 */
void
ipreass_destroy(void)
{

	ipreass_drain_vnet();
	uma_zdestroy(V_ipq_zone);
	V_ipq_zone = NULL;
	for (int i = 0; i < V_ipq_hashsize; i++)
		mtx_destroy(&V_ipq[i].lock);
	free(V_ipq, M_IPREASS_HASH);
}
#endif

/*
 * After maxnipq has been updated, propagate the change to UMA.  The UMA zone
 * max has slightly different semantics than the sysctl, for historical
 * reasons.
 */
static void
ipreass_drain_tomax(void)
{
	struct ipq *fp;
	int target;

	/*
	 * Make sure each bucket is under the new limit.  If
	 * necessary, drop enough of the oldest elements from
	 * each bucket to get under the new limit.
	 */
	for (int i = 0; i < V_ipq_hashsize; i++) {
		IPQ_LOCK(i);
		while (V_ipq[i].count > V_ipreass_maxbucketsize &&
		    (fp = TAILQ_LAST(&V_ipq[i].head, ipqhead)) != NULL)
			ipq_timeout(&V_ipq[i], fp);
		ipreass_reschedule(&V_ipq[i]);
		IPQ_UNLOCK(i);
	}

	/*
	 * If we are over the maximum number of fragments,
	 * drain off enough to get down to the new limit,
	 * stripping off the last elements on queues.  Every
	 * run we strip the oldest element from each bucket.
	 */
	target = uma_zone_get_max(V_ipq_zone);
	while (uma_zone_get_cur(V_ipq_zone) > target) {
		for (int i = 0; i < V_ipq_hashsize; i++) {
			IPQ_LOCK(i);
			fp = TAILQ_LAST(&V_ipq[i].head, ipqhead);
			if (fp != NULL) {
				ipq_timeout(&V_ipq[i], fp);
				ipreass_reschedule(&V_ipq[i]);
			}
			IPQ_UNLOCK(i);
		}
	}
}

static void
ipreass_zone_change(void *tag)
{
	VNET_ITERATOR_DECL(vnet_iter);
	int max;

	maxfrags = IP_MAXFRAGS;
	max = IP_MAXFRAGPACKETS;
	VNET_LIST_RLOCK_NOSLEEP();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		max = uma_zone_set_max(V_ipq_zone, max);
		V_ipreass_maxbucketsize = imax(max / (V_ipq_hashsize / 2), 1);
		ipreass_drain_tomax();
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK_NOSLEEP();
}

/*
 * Change the limit on the UMA zone, or disable fragment allocation
 * altogether.  Since 0 and -1 are special values here, we need our own
 * handler, instead of sysctl_handle_uma_zone_max().
 */
static int
sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS)
{
	int error, max;

	if (V_noreass == 0) {
		max = uma_zone_get_max(V_ipq_zone);
		if (max == 0)
			max = -1;
	} else
		max = 0;
	error = sysctl_handle_int(oidp, &max, 0, req);
	if (error || !req->newptr)
		return (error);
	if (max > 0) {
		/*
		 * XXXRW: Might be a good idea to sanity check the argument
		 * and place an extreme upper bound.
		 */
		max = uma_zone_set_max(V_ipq_zone, max);
		V_ipreass_maxbucketsize = imax(max / (V_ipq_hashsize / 2), 1);
		ipreass_drain_tomax();
		V_noreass = 0;
	} else if (max == 0) {
		V_noreass = 1;
		ipreass_drain();
	} else if (max == -1) {
		V_noreass = 0;
		uma_zone_set_max(V_ipq_zone, 0);
		V_ipreass_maxbucketsize = INT_MAX;
	} else
		return (EINVAL);
	return (0);
}
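
/*
 * For example, from userland:
 *
 *   sysctl net.inet.ip.maxfragpackets=1024   # cap reassembly queue entries
 *   sysctl net.inet.ip.maxfragpackets=0      # disable reassembly entirely
 *   sysctl net.inet.ip.maxfragpackets=-1     # remove the limit
 */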

/*
 * Look for an old fragment queue header that can be reused.  Try
 * first to reuse a header from the currently locked hash bucket.
 */
static struct ipq *
ipq_reuse(int start)
{
	struct ipq *fp;
	int bucket, i;

	IPQ_LOCK_ASSERT(start);

	for (i = 0; i < V_ipq_hashsize; i++) {
		bucket = (start + i) % V_ipq_hashsize;
		if (bucket != start && IPQ_TRYLOCK(bucket) == 0)
			continue;
		fp = TAILQ_LAST(&V_ipq[bucket].head, ipqhead);
		if (fp) {
			struct mbuf *m;

			IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags);
			atomic_subtract_int(&nfrags, fp->ipq_nfrags);
			while (fp->ipq_frags) {
				m = fp->ipq_frags;
				fp->ipq_frags = m->m_nextpkt;
				m_freem(m);
			}
			TAILQ_REMOVE(&V_ipq[bucket].head, fp, ipq_list);
			V_ipq[bucket].count--;
			ipreass_reschedule(&V_ipq[bucket]);
			if (bucket != start)
				IPQ_UNLOCK(bucket);
			break;
		}
		if (bucket != start)
			IPQ_UNLOCK(bucket);
	}
	IPQ_LOCK_ASSERT(start);
	return (fp);
}

/*
 * Free a fragment reassembly header and all associated datagrams.
 */
static void
ipq_free(struct ipqbucket *bucket, struct ipq *fp)
{
	struct mbuf *q;

	atomic_subtract_int(&nfrags, fp->ipq_nfrags);
	while (fp->ipq_frags) {
		q = fp->ipq_frags;
		fp->ipq_frags = q->m_nextpkt;
		m_freem(q);
	}
	TAILQ_REMOVE(&bucket->head, fp, ipq_list);
	bucket->count--;
	uma_zfree(V_ipq_zone, fp);
}

/*
 * Get or set the maximum number of reassembly queues per bucket.
 */
static int
sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS)
{
	int error, max;

	max = V_ipreass_maxbucketsize;
	error = sysctl_handle_int(oidp, &max, 0, req);
	if (error || !req->newptr)
		return (error);
	if (max <= 0)
		return (EINVAL);
	V_ipreass_maxbucketsize = max;
	ipreass_drain_tomax();
	return (0);
}

/*
 * Get or set the IP fragment time to live.
 */
static int
sysctl_fragttl(SYSCTL_HANDLER_ARGS)
{
	u_int ttl;
	int error;

	ttl = V_ipfragttl;
	error = sysctl_handle_int(oidp, &ttl, 0, req);
	if (error || !req->newptr)
		return (error);

	if (ttl < 1 || ttl > MAXTTL)
		return (EINVAL);

	atomic_store_int(&V_ipfragttl, ttl);
	return (0);
}