/*-
 * Copyright (c) 1982, 1986, 1988, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ip_input.c	8.2 (Berkeley) 1/4/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/* Kernel option headers: enable/disable compile-time features below. */
#include "opt_bootp.h"
#include "opt_ipfw.h"
#include "opt_ipstealth.h"
#include "opt_ipsec.h"
#include "opt_route.h"
#include "opt_carp.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/rwlock.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>
#include <sys/vimage.h>

#include <net/pfil.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <net/netisr.h>
#include <net/vnet.h>
#include <net/flowtable.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/ip_icmp.h>
#include <netinet/ip_options.h>
#include <machine/in_cksum.h>
#include <netinet/vinet.h>
#ifdef DEV_CARP
#include <netinet/ip_carp.h>
#endif
#ifdef IPSEC
#include <netinet/ip_ipsec.h>
#endif /* IPSEC */

#include <sys/socketvar.h>

#include <security/mac/mac_framework.h>

/* The IP header layout below assumes a 20-byte fixed header. */
#ifdef CTASSERT
CTASSERT(sizeof(struct ip) == 20);
#endif

#ifndef VIMAGE
#ifndef VIMAGE_GLOBALS
struct vnet_inet vnet_inet_0;
#endif
#endif

/*
 * Per-network-stack state.  When VIMAGE_GLOBALS is defined these live here
 * as file/global variables and are reached through the V_ accessor macros
 * used throughout this file; otherwise they live in struct vnet_inet.
 */
#ifdef VIMAGE_GLOBALS
static int	ipsendredirects;
static int	ip_checkinterface;
static int	ip_keepfaith;
static int	ip_sendsourcequench;
int	ip_defttl;
int	ip_do_randomid;
int	ipforwarding;
struct	in_ifaddrhead in_ifaddrhead;		/* first inet address */
struct	in_ifaddrhashhead *in_ifaddrhashtbl;	/* inet addr hash table */
u_long	in_ifaddrhmask;				/* mask for hash table */
struct	ipstat ipstat;
static int	ip_rsvp_on;
struct	socket *ip_rsvpd;
int	rsvp_on;
static struct ipqhead ipq[IPREASS_NHASH];	/* fragment reassembly buckets */
static int	maxnipq;	/* Administrative limit on # reass queues. */
static int	maxfragsperpacket;
int	ipstealth;
static int	nipq;		/* Total # of reass queues */
#endif

SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_FORWARDING,
    forwarding, CTLFLAG_RW, ipforwarding, 0,
    "Enable IP forwarding between interfaces");

SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_SENDREDIRECTS,
    redirect, CTLFLAG_RW, ipsendredirects, 0,
    "Enable sending IP redirects");

SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_DEFTTL,
    ttl, CTLFLAG_RW, ip_defttl, 0, "Maximum TTL on IP packets");

SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_KEEPFAITH,
    keepfaith, CTLFLAG_RW, ip_keepfaith, 0,
    "Enable packet capture for FAITH IPv4->IPv6 translater daemon");

SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO,
    sendsourcequench, CTLFLAG_RW, ip_sendsourcequench, 0,
    "Enable the transmission of source quench packets");

SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, random_id,
    CTLFLAG_RW, ip_do_randomid, 0, "Assign random ip_id values");

/*
 * XXX - Setting ip_checkinterface mostly implements the receive side of
 * the Strong ES model described in RFC 1122, but since the routing table
 * and transmit implementation do not implement the Strong ES model,
 * setting this to 1 results in an odd hybrid.
 *
 * XXX - ip_checkinterface currently must be disabled if you use ipnat
 * to translate the destination address to another local interface.
 *
 * XXX - ip_checkinterface must be disabled if you add IP aliases
 * to the loopback interface instead of the interface where the
 * packets for those addresses are received.
 */
SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO,
    check_interface, CTLFLAG_RW, ip_checkinterface, 0,
    "Verify packet arrives on correct interface");

struct pfil_head inet_pfil_hook;	/* Packet filter hooks */

/* netisr registration: ip_input() is the handler for NETISR_IP work. */
static struct netisr_handler ip_nh = {
	.nh_name = "ip",
	.nh_handler = ip_input,
	.nh_proto = NETISR_IP,
	.nh_policy = NETISR_POLICY_FLOW,
};

extern	struct domain inetdomain;
extern	struct protosw inetsw[];
/* Maps an IP protocol number to its slot in inetsw[]; filled by ip_init(). */
u_char	ip_protox[IPPROTO_MAX];


SYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_ip, IPCTL_STATS, stats, CTLFLAG_RW,
    ipstat, ipstat, "IP statistics (struct ipstat, netinet/ip_var.h)");

#ifdef VIMAGE_GLOBALS
static uma_zone_t ipq_zone;		/* allocator for struct ipq */
#endif
static struct mtx ipqlock;

/*
 * Global mutex serializing all access to the fragment reassembly queues
 * (V_ipq[] and their struct ipq entries) across ip_reass(), ip_slowtimo()
 * and ip_drain().
 */
#define	IPQ_LOCK()	mtx_lock(&ipqlock)
#define	IPQ_UNLOCK()	mtx_unlock(&ipqlock)
#define	IPQ_LOCK_INIT()	mtx_init(&ipqlock, "ipqlock", NULL, MTX_DEF)
#define	IPQ_LOCK_ASSERT()	mtx_assert(&ipqlock, MA_OWNED)

static void	maxnipq_update(void);
static void	ipq_zone_change(void *);

SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, fragpackets,
    CTLFLAG_RD, nipq, 0,
    "Current number of IPv4 fragment reassembly queue entries");

SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, maxfragsperpacket,
    CTLFLAG_RW, maxfragsperpacket, 0,
    "Maximum number of IPv4 fragments allowed per packet");

struct callout	ipport_tick_callout;	/* periodic tick started in ip_init() */

#ifdef IPCTL_DEFMTU
SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW,
    &ip_mtu, 0, "Default MTU");
#endif

#ifdef IPSTEALTH
SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW,
    ipstealth, 0, "IP stealth mode, no TTL decrementation on forwarding");
#endif
#ifdef FLOWTABLE
static int ip_output_flowtable_size = 2048;
TUNABLE_INT("net.inet.ip.output_flowtable_size", &ip_output_flowtable_size);
SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, output_flowtable_size,
    CTLFLAG_RDTUN, ip_output_flowtable_size, 2048,
    "number of entries in the per-cpu output flow caches");

struct flowtable *ip_ft;		/* allocated in ip_init() */
#endif

#ifdef VIMAGE_GLOBALS
int	fw_one_pass;
#endif

static void	ip_freef(struct ipqhead *, struct ipq *);

/*
 * Without VIMAGE_GLOBALS, register the inet module's per-vnet state size
 * with the vnet framework at SI_SUB_PROTO_BEGIN.
 */
#ifndef VIMAGE_GLOBALS
static void vnet_inet_register(void);

static const vnet_modinfo_t vnet_inet_modinfo = {
	.vmi_id		= VNET_MOD_INET,
	.vmi_name	= "inet",
	.vmi_size	= sizeof(struct vnet_inet)
};

static void vnet_inet_register()
{

	vnet_mod_register(&vnet_inet_modinfo);
}

SYSINIT(inet, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST, vnet_inet_register, 0);
#endif

/*
 * Sysctl handler: read or set the netisr queue limit for IP.
 * Rejects non-positive limits; otherwise forwards the new value to netisr.
 */
static int
sysctl_netinet_intr_queue_maxlen(SYSCTL_HANDLER_ARGS)
{
	int error, qlimit;

	netisr_getqlimit(&ip_nh, &qlimit);
	error = sysctl_handle_int(oidp, &qlimit, 0, req);
	if (error || !req->newptr)
		return (error);
	if (qlimit < 1)
		return (EINVAL);
	return (netisr_setqlimit(&ip_nh, qlimit));
}
SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen,
    CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_queue_maxlen, "I",
    "Maximum size of the IP input queue");

/*
 * Sysctl handler: report the netisr drop counter for IP; writing 0 clears
 * it, any other write is rejected.
 * NOTE(review): the 64-bit counter is narrowed into an int for the sysctl
 * interface, so very large drop counts are truncated on read.
 */
static int
sysctl_netinet_intr_queue_drops(SYSCTL_HANDLER_ARGS)
{
	u_int64_t qdrops_long;
	int error, qdrops;

	netisr_getqdrops(&ip_nh, &qdrops_long);
	qdrops = qdrops_long;
	error = sysctl_handle_int(oidp, &qdrops, 0, req);
	if (error || !req->newptr)
		return (error);
	if (qdrops != 0)
		return (EINVAL);
	netisr_clearqdrops(&ip_nh);
	return (0);
}

SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops,
    CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_queue_drops, "I",
    "Number of packets dropped from the IP input queue");

/*
 * IP initialization: fill in IP protocol switch table.
 * All protocols not implemented in kernel go to raw IP protocol handler.
 *
 * Per-vnet defaults are set for every instance; the global pieces
 * (ip_protox[], pfil hook, callouts, netisr registration) are set up for
 * the default vnet only.
 */
void
ip_init(void)
{
	INIT_VNET_INET(curvnet);
	struct protosw *pr;
	int i;

	/* Per-vnet tunable defaults. */
	V_ipsendredirects = 1; /* XXX */
	V_ip_checkinterface = 0;
	V_ip_keepfaith = 0;
	V_ip_sendsourcequench = 0;
	V_rsvp_on = 0;
	V_ip_defttl = IPDEFTTL;
	V_ip_do_randomid = 0;
	V_ip_id = time_second & 0xffff;
	V_ipforwarding = 0;
	V_ipstealth = 0;
	V_nipq = 0;	/* Total # of reass queues */

	/* Default local port ranges; all adjustable via sysctl. */
	V_ipport_lowfirstauto = IPPORT_RESERVED - 1;	/* 1023 */
	V_ipport_lowlastauto = IPPORT_RESERVEDSTART;	/* 600 */
	V_ipport_firstauto = IPPORT_EPHEMERALFIRST;	/* 10000 */
	V_ipport_lastauto = IPPORT_EPHEMERALLAST;	/* 65535 */
	V_ipport_hifirstauto = IPPORT_HIFIRSTAUTO;	/* 49152 */
	V_ipport_hilastauto = IPPORT_HILASTAUTO;	/* 65535 */
	V_ipport_reservedhigh = IPPORT_RESERVED - 1;	/* 1023 */
	V_ipport_reservedlow = 0;
	V_ipport_randomized = 1;	/* user controlled via sysctl */
	V_ipport_randomcps = 10;	/* user controlled via sysctl */
	V_ipport_randomtime = 45;	/* user controlled via sysctl */
	V_ipport_stoprandom = 0;	/* toggled by ipport_tick */

	V_fw_one_pass = 1;

#ifdef NOTYET
	/* XXX global static but not instantiated in this file */
	V_ipfastforward_active = 0;
	V_subnetsarelocal = 0;
	V_sameprefixcarponly = 0;
#endif

	TAILQ_INIT(&V_in_ifaddrhead);
	V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &V_in_ifaddrhmask);

	/* Initialize IP reassembly queue. */
	for (i = 0; i < IPREASS_NHASH; i++)
		TAILQ_INIT(&V_ipq[i]);
	V_maxnipq = nmbclusters / 32;
	V_maxfragsperpacket = 16;
	V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL,
	    NULL, UMA_ALIGN_PTR, 0);
	maxnipq_update();

	/* Skip initialization of globals for non-default instances. */
	if (!IS_DEFAULT_VNET(curvnet))
		return;

	pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
	if (pr == NULL)
		panic("ip_init: PF_INET not found");

	/* Initialize the entire ip_protox[] array to IPPROTO_RAW. */
	for (i = 0; i < IPPROTO_MAX; i++)
		ip_protox[i] = pr - inetsw;
	/*
	 * Cycle through IP protocols and put them into the appropriate place
	 * in ip_protox[].
	 */
	for (pr = inetdomain.dom_protosw;
	    pr < inetdomain.dom_protoswNPROTOSW; pr++)
		if (pr->pr_domain->dom_family == PF_INET &&
		    pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) {
			/* Be careful to only index valid IP protocols. */
			if (pr->pr_protocol < IPPROTO_MAX)
				ip_protox[pr->pr_protocol] = pr - inetsw;
		}

	/* Initialize packet filter hooks. */
	inet_pfil_hook.ph_type = PFIL_TYPE_AF;
	inet_pfil_hook.ph_af = AF_INET;
	if ((i = pfil_head_register(&inet_pfil_hook)) != 0)
		printf("%s: WARNING: unable to register pfil hook, "
			"error %d\n", __func__, i);

	/* Start ipport_tick. */
	callout_init(&ipport_tick_callout, CALLOUT_MPSAFE);
	callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL);
	EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL,
		SHUTDOWN_PRI_DEFAULT);
	EVENTHANDLER_REGISTER(nmbclusters_change, ipq_zone_change,
		NULL, EVENTHANDLER_PRI_ANY);

	/* Initialize various other remaining things. */
	IPQ_LOCK_INIT();
	netisr_register(&ip_nh);
#ifdef FLOWTABLE
	ip_ft = flowtable_alloc(ip_output_flowtable_size, FL_PCPU);
#endif
}

/*
 * Shutdown hook (registered on shutdown_pre_sync in ip_init()):
 * stop the ipport_tick callout.  The argument is unused.
 */
void
ip_fini(void *xtp)
{

	callout_stop(&ipport_tick_callout);
}

/*
 * Ip input routine.  Checksum and byte swap header.  If fragmented
 * try to reassemble.  Process options.  Pass to next level.
395 */ 396 void 397 ip_input(struct mbuf *m) 398 { 399 INIT_VNET_INET(curvnet); 400 struct ip *ip = NULL; 401 struct in_ifaddr *ia = NULL; 402 struct ifaddr *ifa; 403 struct ifnet *ifp; 404 int checkif, hlen = 0; 405 u_short sum; 406 int dchg = 0; /* dest changed after fw */ 407 struct in_addr odst; /* original dst address */ 408 409 M_ASSERTPKTHDR(m); 410 411 if (m->m_flags & M_FASTFWD_OURS) { 412 /* 413 * Firewall or NAT changed destination to local. 414 * We expect ip_len and ip_off to be in host byte order. 415 */ 416 m->m_flags &= ~M_FASTFWD_OURS; 417 /* Set up some basics that will be used later. */ 418 ip = mtod(m, struct ip *); 419 hlen = ip->ip_hl << 2; 420 goto ours; 421 } 422 423 IPSTAT_INC(ips_total); 424 425 if (m->m_pkthdr.len < sizeof(struct ip)) 426 goto tooshort; 427 428 if (m->m_len < sizeof (struct ip) && 429 (m = m_pullup(m, sizeof (struct ip))) == NULL) { 430 IPSTAT_INC(ips_toosmall); 431 return; 432 } 433 ip = mtod(m, struct ip *); 434 435 if (ip->ip_v != IPVERSION) { 436 IPSTAT_INC(ips_badvers); 437 goto bad; 438 } 439 440 hlen = ip->ip_hl << 2; 441 if (hlen < sizeof(struct ip)) { /* minimum header length */ 442 IPSTAT_INC(ips_badhlen); 443 goto bad; 444 } 445 if (hlen > m->m_len) { 446 if ((m = m_pullup(m, hlen)) == NULL) { 447 IPSTAT_INC(ips_badhlen); 448 return; 449 } 450 ip = mtod(m, struct ip *); 451 } 452 453 /* 127/8 must not appear on wire - RFC1122 */ 454 ifp = m->m_pkthdr.rcvif; 455 if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || 456 (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { 457 if ((ifp->if_flags & IFF_LOOPBACK) == 0) { 458 IPSTAT_INC(ips_badaddr); 459 goto bad; 460 } 461 } 462 463 if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { 464 sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); 465 } else { 466 if (hlen == sizeof(struct ip)) { 467 sum = in_cksum_hdr(ip); 468 } else { 469 sum = in_cksum(m, hlen); 470 } 471 } 472 if (sum) { 473 IPSTAT_INC(ips_badsum); 474 goto bad; 475 } 476 477 
#ifdef ALTQ 478 if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0) 479 /* packet is dropped by traffic conditioner */ 480 return; 481 #endif 482 483 /* 484 * Convert fields to host representation. 485 */ 486 ip->ip_len = ntohs(ip->ip_len); 487 if (ip->ip_len < hlen) { 488 IPSTAT_INC(ips_badlen); 489 goto bad; 490 } 491 ip->ip_off = ntohs(ip->ip_off); 492 493 /* 494 * Check that the amount of data in the buffers 495 * is as at least much as the IP header would have us expect. 496 * Trim mbufs if longer than we expect. 497 * Drop packet if shorter than we expect. 498 */ 499 if (m->m_pkthdr.len < ip->ip_len) { 500 tooshort: 501 IPSTAT_INC(ips_tooshort); 502 goto bad; 503 } 504 if (m->m_pkthdr.len > ip->ip_len) { 505 if (m->m_len == m->m_pkthdr.len) { 506 m->m_len = ip->ip_len; 507 m->m_pkthdr.len = ip->ip_len; 508 } else 509 m_adj(m, ip->ip_len - m->m_pkthdr.len); 510 } 511 #ifdef IPSEC 512 /* 513 * Bypass packet filtering for packets from a tunnel (gif). 514 */ 515 if (ip_ipsec_filtertunnel(m)) 516 goto passin; 517 #endif /* IPSEC */ 518 519 /* 520 * Run through list of hooks for input packets. 521 * 522 * NB: Beware of the destination address changing (e.g. 523 * by NAT rewriting). When this happens, tell 524 * ip_forward to do the right thing. 525 */ 526 527 /* Jump over all PFIL processing if hooks are not active. */ 528 if (!PFIL_HOOKED(&inet_pfil_hook)) 529 goto passin; 530 531 odst = ip->ip_dst; 532 if (pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_IN, NULL) != 0) 533 return; 534 if (m == NULL) /* consumed by filter */ 535 return; 536 537 ip = mtod(m, struct ip *); 538 dchg = (odst.s_addr != ip->ip_dst.s_addr); 539 ifp = m->m_pkthdr.rcvif; 540 541 #ifdef IPFIREWALL_FORWARD 542 if (m->m_flags & M_FASTFWD_OURS) { 543 m->m_flags &= ~M_FASTFWD_OURS; 544 goto ours; 545 } 546 if ((dchg = (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL)) != 0) { 547 /* 548 * Directly ship on the packet. 
This allows to forward packets 549 * that were destined for us to some other directly connected 550 * host. 551 */ 552 ip_forward(m, dchg); 553 return; 554 } 555 #endif /* IPFIREWALL_FORWARD */ 556 557 passin: 558 /* 559 * Process options and, if not destined for us, 560 * ship it on. ip_dooptions returns 1 when an 561 * error was detected (causing an icmp message 562 * to be sent and the original packet to be freed). 563 */ 564 if (hlen > sizeof (struct ip) && ip_dooptions(m, 0)) 565 return; 566 567 /* greedy RSVP, snatches any PATH packet of the RSVP protocol and no 568 * matter if it is destined to another node, or whether it is 569 * a multicast one, RSVP wants it! and prevents it from being forwarded 570 * anywhere else. Also checks if the rsvp daemon is running before 571 * grabbing the packet. 572 */ 573 if (V_rsvp_on && ip->ip_p==IPPROTO_RSVP) 574 goto ours; 575 576 /* 577 * Check our list of addresses, to see if the packet is for us. 578 * If we don't have any addresses, assume any unicast packet 579 * we receive might be for us (and let the upper layers deal 580 * with it). 581 */ 582 if (TAILQ_EMPTY(&V_in_ifaddrhead) && 583 (m->m_flags & (M_MCAST|M_BCAST)) == 0) 584 goto ours; 585 586 /* 587 * Enable a consistency check between the destination address 588 * and the arrival interface for a unicast packet (the RFC 1122 589 * strong ES model) if IP forwarding is disabled and the packet 590 * is not locally generated and the packet is not subject to 591 * 'ipfw fwd'. 592 * 593 * XXX - Checking also should be disabled if the destination 594 * address is ipnat'ed to a different interface. 595 * 596 * XXX - Checking is incompatible with IP aliases added 597 * to the loopback interface instead of the interface where 598 * the packets are received. 599 * 600 * XXX - This is the case for carp vhost IPs as well so we 601 * insert a workaround. If the packet got here, we already 602 * checked with carp_iamatch() and carp_forus(). 
603 */ 604 checkif = V_ip_checkinterface && (V_ipforwarding == 0) && 605 ifp != NULL && ((ifp->if_flags & IFF_LOOPBACK) == 0) && 606 #ifdef DEV_CARP 607 !ifp->if_carp && 608 #endif 609 (dchg == 0); 610 611 /* 612 * Check for exact addresses in the hash bucket. 613 */ 614 LIST_FOREACH(ia, INADDR_HASH(ip->ip_dst.s_addr), ia_hash) { 615 /* 616 * If the address matches, verify that the packet 617 * arrived via the correct interface if checking is 618 * enabled. 619 */ 620 if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr && 621 (!checkif || ia->ia_ifp == ifp)) 622 goto ours; 623 } 624 /* 625 * Check for broadcast addresses. 626 * 627 * Only accept broadcast packets that arrive via the matching 628 * interface. Reception of forwarded directed broadcasts would 629 * be handled via ip_forward() and ether_output() with the loopback 630 * into the stack for SIMPLEX interfaces handled by ether_output(). 631 */ 632 if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) { 633 IF_ADDR_LOCK(ifp); 634 TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 635 if (ifa->ifa_addr->sa_family != AF_INET) 636 continue; 637 ia = ifatoia(ifa); 638 if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == 639 ip->ip_dst.s_addr) { 640 IF_ADDR_UNLOCK(ifp); 641 goto ours; 642 } 643 if (ia->ia_netbroadcast.s_addr == ip->ip_dst.s_addr) { 644 IF_ADDR_UNLOCK(ifp); 645 goto ours; 646 } 647 #ifdef BOOTP_COMPAT 648 if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) { 649 IF_ADDR_UNLOCK(ifp); 650 goto ours; 651 } 652 #endif 653 } 654 IF_ADDR_UNLOCK(ifp); 655 } 656 /* RFC 3927 2.7: Do not forward datagrams for 169.254.0.0/16. */ 657 if (IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) { 658 IPSTAT_INC(ips_cantforward); 659 m_freem(m); 660 return; 661 } 662 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { 663 if (V_ip_mrouter) { 664 /* 665 * If we are acting as a multicast router, all 666 * incoming multicast packets are passed to the 667 * kernel-level multicast forwarding function. 
668 * The packet is returned (relatively) intact; if 669 * ip_mforward() returns a non-zero value, the packet 670 * must be discarded, else it may be accepted below. 671 */ 672 if (ip_mforward && ip_mforward(ip, ifp, m, 0) != 0) { 673 IPSTAT_INC(ips_cantforward); 674 m_freem(m); 675 return; 676 } 677 678 /* 679 * The process-level routing daemon needs to receive 680 * all multicast IGMP packets, whether or not this 681 * host belongs to their destination groups. 682 */ 683 if (ip->ip_p == IPPROTO_IGMP) 684 goto ours; 685 IPSTAT_INC(ips_forward); 686 } 687 /* 688 * Assume the packet is for us, to avoid prematurely taking 689 * a lock on the in_multi hash. Protocols must perform 690 * their own filtering and update statistics accordingly. 691 */ 692 goto ours; 693 } 694 if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST) 695 goto ours; 696 if (ip->ip_dst.s_addr == INADDR_ANY) 697 goto ours; 698 699 /* 700 * FAITH(Firewall Aided Internet Translator) 701 */ 702 if (ifp && ifp->if_type == IFT_FAITH) { 703 if (V_ip_keepfaith) { 704 if (ip->ip_p == IPPROTO_TCP || ip->ip_p == IPPROTO_ICMP) 705 goto ours; 706 } 707 m_freem(m); 708 return; 709 } 710 711 /* 712 * Not for us; forward if possible and desirable. 713 */ 714 if (V_ipforwarding == 0) { 715 IPSTAT_INC(ips_cantforward); 716 m_freem(m); 717 } else { 718 #ifdef IPSEC 719 if (ip_ipsec_fwd(m)) 720 goto bad; 721 #endif /* IPSEC */ 722 ip_forward(m, dchg); 723 } 724 return; 725 726 ours: 727 #ifdef IPSTEALTH 728 /* 729 * IPSTEALTH: Process non-routing options only 730 * if the packet is destined for us. 731 */ 732 if (V_ipstealth && hlen > sizeof (struct ip) && 733 ip_dooptions(m, 1)) 734 return; 735 #endif /* IPSTEALTH */ 736 737 /* Count the packet in the ip address stats */ 738 if (ia != NULL) { 739 ia->ia_ifa.if_ipackets++; 740 ia->ia_ifa.if_ibytes += m->m_pkthdr.len; 741 } 742 743 /* 744 * Attempt reassembly; if it succeeds, proceed. 745 * ip_reass() will return a different mbuf. 
746 */ 747 if (ip->ip_off & (IP_MF | IP_OFFMASK)) { 748 m = ip_reass(m); 749 if (m == NULL) 750 return; 751 ip = mtod(m, struct ip *); 752 /* Get the header length of the reassembled packet */ 753 hlen = ip->ip_hl << 2; 754 } 755 756 /* 757 * Further protocols expect the packet length to be w/o the 758 * IP header. 759 */ 760 ip->ip_len -= hlen; 761 762 #ifdef IPSEC 763 /* 764 * enforce IPsec policy checking if we are seeing last header. 765 * note that we do not visit this with protocols with pcb layer 766 * code - like udp/tcp/raw ip. 767 */ 768 if (ip_ipsec_input(m)) 769 goto bad; 770 #endif /* IPSEC */ 771 772 /* 773 * Switch out to protocol's input routine. 774 */ 775 IPSTAT_INC(ips_delivered); 776 777 (*inetsw[ip_protox[ip->ip_p]].pr_input)(m, hlen); 778 return; 779 bad: 780 m_freem(m); 781 } 782 783 /* 784 * After maxnipq has been updated, propagate the change to UMA. The UMA zone 785 * max has slightly different semantics than the sysctl, for historical 786 * reasons. 787 */ 788 static void 789 maxnipq_update(void) 790 { 791 INIT_VNET_INET(curvnet); 792 793 /* 794 * -1 for unlimited allocation. 795 */ 796 if (V_maxnipq < 0) 797 uma_zone_set_max(V_ipq_zone, 0); 798 /* 799 * Positive number for specific bound. 800 */ 801 if (V_maxnipq > 0) 802 uma_zone_set_max(V_ipq_zone, V_maxnipq); 803 /* 804 * Zero specifies no further fragment queue allocation -- set the 805 * bound very low, but rely on implementation elsewhere to actually 806 * prevent allocation and reclaim current queues. 
807 */ 808 if (V_maxnipq == 0) 809 uma_zone_set_max(V_ipq_zone, 1); 810 } 811 812 static void 813 ipq_zone_change(void *tag) 814 { 815 INIT_VNET_INET(curvnet); 816 817 if (V_maxnipq > 0 && V_maxnipq < (nmbclusters / 32)) { 818 V_maxnipq = nmbclusters / 32; 819 maxnipq_update(); 820 } 821 } 822 823 static int 824 sysctl_maxnipq(SYSCTL_HANDLER_ARGS) 825 { 826 INIT_VNET_INET(curvnet); 827 int error, i; 828 829 i = V_maxnipq; 830 error = sysctl_handle_int(oidp, &i, 0, req); 831 if (error || !req->newptr) 832 return (error); 833 834 /* 835 * XXXRW: Might be a good idea to sanity check the argument and place 836 * an extreme upper bound. 837 */ 838 if (i < -1) 839 return (EINVAL); 840 V_maxnipq = i; 841 maxnipq_update(); 842 return (0); 843 } 844 845 SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets, CTLTYPE_INT|CTLFLAG_RW, 846 NULL, 0, sysctl_maxnipq, "I", 847 "Maximum number of IPv4 fragment reassembly queue entries"); 848 849 /* 850 * Take incoming datagram fragment and try to reassemble it into 851 * whole datagram. If the argument is the first fragment or one 852 * in between the function will return NULL and store the mbuf 853 * in the fragment chain. If the argument is the last fragment 854 * the packet will be reassembled and the pointer to the new 855 * mbuf returned for further processing. Only m_tags attached 856 * to the first packet/fragment are preserved. 857 * The IP header is *NOT* adjusted out of iplen. 858 */ 859 struct mbuf * 860 ip_reass(struct mbuf *m) 861 { 862 INIT_VNET_INET(curvnet); 863 struct ip *ip; 864 struct mbuf *p, *q, *nq, *t; 865 struct ipq *fp = NULL; 866 struct ipqhead *head; 867 int i, hlen, next; 868 u_int8_t ecn, ecn0; 869 u_short hash; 870 871 /* If maxnipq or maxfragsperpacket are 0, never accept fragments. 
*/ 872 if (V_maxnipq == 0 || V_maxfragsperpacket == 0) { 873 IPSTAT_INC(ips_fragments); 874 IPSTAT_INC(ips_fragdropped); 875 m_freem(m); 876 return (NULL); 877 } 878 879 ip = mtod(m, struct ip *); 880 hlen = ip->ip_hl << 2; 881 882 hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id); 883 head = &V_ipq[hash]; 884 IPQ_LOCK(); 885 886 /* 887 * Look for queue of fragments 888 * of this datagram. 889 */ 890 TAILQ_FOREACH(fp, head, ipq_list) 891 if (ip->ip_id == fp->ipq_id && 892 ip->ip_src.s_addr == fp->ipq_src.s_addr && 893 ip->ip_dst.s_addr == fp->ipq_dst.s_addr && 894 #ifdef MAC 895 mac_ipq_match(m, fp) && 896 #endif 897 ip->ip_p == fp->ipq_p) 898 goto found; 899 900 fp = NULL; 901 902 /* 903 * Attempt to trim the number of allocated fragment queues if it 904 * exceeds the administrative limit. 905 */ 906 if ((V_nipq > V_maxnipq) && (V_maxnipq > 0)) { 907 /* 908 * drop something from the tail of the current queue 909 * before proceeding further 910 */ 911 struct ipq *q = TAILQ_LAST(head, ipqhead); 912 if (q == NULL) { /* gak */ 913 for (i = 0; i < IPREASS_NHASH; i++) { 914 struct ipq *r = TAILQ_LAST(&V_ipq[i], ipqhead); 915 if (r) { 916 IPSTAT_ADD(ips_fragtimeout, 917 r->ipq_nfrags); 918 ip_freef(&V_ipq[i], r); 919 break; 920 } 921 } 922 } else { 923 IPSTAT_ADD(ips_fragtimeout, q->ipq_nfrags); 924 ip_freef(head, q); 925 } 926 } 927 928 found: 929 /* 930 * Adjust ip_len to not reflect header, 931 * convert offset of this to bytes. 932 */ 933 ip->ip_len -= hlen; 934 if (ip->ip_off & IP_MF) { 935 /* 936 * Make sure that fragments have a data length 937 * that's a non-zero multiple of 8 bytes. 938 */ 939 if (ip->ip_len == 0 || (ip->ip_len & 0x7) != 0) { 940 IPSTAT_INC(ips_toosmall); /* XXX */ 941 goto dropfrag; 942 } 943 m->m_flags |= M_FRAG; 944 } else 945 m->m_flags &= ~M_FRAG; 946 ip->ip_off <<= 3; 947 948 949 /* 950 * Attempt reassembly; if it succeeds, proceed. 951 * ip_reass() will return a different mbuf. 
952 */ 953 IPSTAT_INC(ips_fragments); 954 m->m_pkthdr.header = ip; 955 956 /* Previous ip_reass() started here. */ 957 /* 958 * Presence of header sizes in mbufs 959 * would confuse code below. 960 */ 961 m->m_data += hlen; 962 m->m_len -= hlen; 963 964 /* 965 * If first fragment to arrive, create a reassembly queue. 966 */ 967 if (fp == NULL) { 968 fp = uma_zalloc(V_ipq_zone, M_NOWAIT); 969 if (fp == NULL) 970 goto dropfrag; 971 #ifdef MAC 972 if (mac_ipq_init(fp, M_NOWAIT) != 0) { 973 uma_zfree(V_ipq_zone, fp); 974 fp = NULL; 975 goto dropfrag; 976 } 977 mac_ipq_create(m, fp); 978 #endif 979 TAILQ_INSERT_HEAD(head, fp, ipq_list); 980 V_nipq++; 981 fp->ipq_nfrags = 1; 982 fp->ipq_ttl = IPFRAGTTL; 983 fp->ipq_p = ip->ip_p; 984 fp->ipq_id = ip->ip_id; 985 fp->ipq_src = ip->ip_src; 986 fp->ipq_dst = ip->ip_dst; 987 fp->ipq_frags = m; 988 m->m_nextpkt = NULL; 989 goto done; 990 } else { 991 fp->ipq_nfrags++; 992 #ifdef MAC 993 mac_ipq_update(m, fp); 994 #endif 995 } 996 997 #define GETIP(m) ((struct ip*)((m)->m_pkthdr.header)) 998 999 /* 1000 * Handle ECN by comparing this segment with the first one; 1001 * if CE is set, do not lose CE. 1002 * drop if CE and not-ECT are mixed for the same packet. 1003 */ 1004 ecn = ip->ip_tos & IPTOS_ECN_MASK; 1005 ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK; 1006 if (ecn == IPTOS_ECN_CE) { 1007 if (ecn0 == IPTOS_ECN_NOTECT) 1008 goto dropfrag; 1009 if (ecn0 != IPTOS_ECN_CE) 1010 GETIP(fp->ipq_frags)->ip_tos |= IPTOS_ECN_CE; 1011 } 1012 if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT) 1013 goto dropfrag; 1014 1015 /* 1016 * Find a segment which begins after this one does. 1017 */ 1018 for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) 1019 if (GETIP(q)->ip_off > ip->ip_off) 1020 break; 1021 1022 /* 1023 * If there is a preceding segment, it may provide some of 1024 * our data already. If so, drop the data from the incoming 1025 * segment. 
If it provides all of our data, drop us, otherwise 1026 * stick new segment in the proper place. 1027 * 1028 * If some of the data is dropped from the the preceding 1029 * segment, then it's checksum is invalidated. 1030 */ 1031 if (p) { 1032 i = GETIP(p)->ip_off + GETIP(p)->ip_len - ip->ip_off; 1033 if (i > 0) { 1034 if (i >= ip->ip_len) 1035 goto dropfrag; 1036 m_adj(m, i); 1037 m->m_pkthdr.csum_flags = 0; 1038 ip->ip_off += i; 1039 ip->ip_len -= i; 1040 } 1041 m->m_nextpkt = p->m_nextpkt; 1042 p->m_nextpkt = m; 1043 } else { 1044 m->m_nextpkt = fp->ipq_frags; 1045 fp->ipq_frags = m; 1046 } 1047 1048 /* 1049 * While we overlap succeeding segments trim them or, 1050 * if they are completely covered, dequeue them. 1051 */ 1052 for (; q != NULL && ip->ip_off + ip->ip_len > GETIP(q)->ip_off; 1053 q = nq) { 1054 i = (ip->ip_off + ip->ip_len) - GETIP(q)->ip_off; 1055 if (i < GETIP(q)->ip_len) { 1056 GETIP(q)->ip_len -= i; 1057 GETIP(q)->ip_off += i; 1058 m_adj(q, i); 1059 q->m_pkthdr.csum_flags = 0; 1060 break; 1061 } 1062 nq = q->m_nextpkt; 1063 m->m_nextpkt = nq; 1064 IPSTAT_INC(ips_fragdropped); 1065 fp->ipq_nfrags--; 1066 m_freem(q); 1067 } 1068 1069 /* 1070 * Check for complete reassembly and perform frag per packet 1071 * limiting. 1072 * 1073 * Frag limiting is performed here so that the nth frag has 1074 * a chance to complete the packet before we drop the packet. 1075 * As a result, n+1 frags are actually allowed per packet, but 1076 * only n will ever be stored. (n = maxfragsperpacket.) 
1077 * 1078 */ 1079 next = 0; 1080 for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) { 1081 if (GETIP(q)->ip_off != next) { 1082 if (fp->ipq_nfrags > V_maxfragsperpacket) { 1083 IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags); 1084 ip_freef(head, fp); 1085 } 1086 goto done; 1087 } 1088 next += GETIP(q)->ip_len; 1089 } 1090 /* Make sure the last packet didn't have the IP_MF flag */ 1091 if (p->m_flags & M_FRAG) { 1092 if (fp->ipq_nfrags > V_maxfragsperpacket) { 1093 IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags); 1094 ip_freef(head, fp); 1095 } 1096 goto done; 1097 } 1098 1099 /* 1100 * Reassembly is complete. Make sure the packet is a sane size. 1101 */ 1102 q = fp->ipq_frags; 1103 ip = GETIP(q); 1104 if (next + (ip->ip_hl << 2) > IP_MAXPACKET) { 1105 IPSTAT_INC(ips_toolong); 1106 IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags); 1107 ip_freef(head, fp); 1108 goto done; 1109 } 1110 1111 /* 1112 * Concatenate fragments. 1113 */ 1114 m = q; 1115 t = m->m_next; 1116 m->m_next = NULL; 1117 m_cat(m, t); 1118 nq = q->m_nextpkt; 1119 q->m_nextpkt = NULL; 1120 for (q = nq; q != NULL; q = nq) { 1121 nq = q->m_nextpkt; 1122 q->m_nextpkt = NULL; 1123 m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags; 1124 m->m_pkthdr.csum_data += q->m_pkthdr.csum_data; 1125 m_cat(m, q); 1126 } 1127 /* 1128 * In order to do checksumming faster we do 'end-around carry' here 1129 * (and not in for{} loop), though it implies we are not going to 1130 * reassemble more than 64k fragments. 1131 */ 1132 m->m_pkthdr.csum_data = 1133 (m->m_pkthdr.csum_data & 0xffff) + (m->m_pkthdr.csum_data >> 16); 1134 #ifdef MAC 1135 mac_ipq_reassemble(fp, m); 1136 mac_ipq_destroy(fp); 1137 #endif 1138 1139 /* 1140 * Create header for new ip packet by modifying header of first 1141 * packet; dequeue and discard fragment reassembly header. 1142 * Make header visible. 
1143 */ 1144 ip->ip_len = (ip->ip_hl << 2) + next; 1145 ip->ip_src = fp->ipq_src; 1146 ip->ip_dst = fp->ipq_dst; 1147 TAILQ_REMOVE(head, fp, ipq_list); 1148 V_nipq--; 1149 uma_zfree(V_ipq_zone, fp); 1150 m->m_len += (ip->ip_hl << 2); 1151 m->m_data -= (ip->ip_hl << 2); 1152 /* some debugging cruft by sklower, below, will go away soon */ 1153 if (m->m_flags & M_PKTHDR) /* XXX this should be done elsewhere */ 1154 m_fixhdr(m); 1155 IPSTAT_INC(ips_reassembled); 1156 IPQ_UNLOCK(); 1157 return (m); 1158 1159 dropfrag: 1160 IPSTAT_INC(ips_fragdropped); 1161 if (fp != NULL) 1162 fp->ipq_nfrags--; 1163 m_freem(m); 1164 done: 1165 IPQ_UNLOCK(); 1166 return (NULL); 1167 1168 #undef GETIP 1169 } 1170 1171 /* 1172 * Free a fragment reassembly header and all 1173 * associated datagrams. 1174 */ 1175 static void 1176 ip_freef(struct ipqhead *fhp, struct ipq *fp) 1177 { 1178 INIT_VNET_INET(curvnet); 1179 struct mbuf *q; 1180 1181 IPQ_LOCK_ASSERT(); 1182 1183 while (fp->ipq_frags) { 1184 q = fp->ipq_frags; 1185 fp->ipq_frags = q->m_nextpkt; 1186 m_freem(q); 1187 } 1188 TAILQ_REMOVE(fhp, fp, ipq_list); 1189 uma_zfree(V_ipq_zone, fp); 1190 V_nipq--; 1191 } 1192 1193 /* 1194 * IP timer processing; 1195 * if a timer expires on a reassembly 1196 * queue, discard it. 1197 */ 1198 void 1199 ip_slowtimo(void) 1200 { 1201 VNET_ITERATOR_DECL(vnet_iter); 1202 struct ipq *fp; 1203 int i; 1204 1205 IPQ_LOCK(); 1206 VNET_LIST_RLOCK(); 1207 VNET_FOREACH(vnet_iter) { 1208 CURVNET_SET(vnet_iter); 1209 INIT_VNET_INET(vnet_iter); 1210 for (i = 0; i < IPREASS_NHASH; i++) { 1211 for(fp = TAILQ_FIRST(&V_ipq[i]); fp;) { 1212 struct ipq *fpp; 1213 1214 fpp = fp; 1215 fp = TAILQ_NEXT(fp, ipq_list); 1216 if(--fpp->ipq_ttl == 0) { 1217 IPSTAT_ADD(ips_fragtimeout, 1218 fpp->ipq_nfrags); 1219 ip_freef(&V_ipq[i], fpp); 1220 } 1221 } 1222 } 1223 /* 1224 * If we are over the maximum number of fragments 1225 * (due to the limit being lowered), drain off 1226 * enough to get down to the new limit. 
1227 */ 1228 if (V_maxnipq >= 0 && V_nipq > V_maxnipq) { 1229 for (i = 0; i < IPREASS_NHASH; i++) { 1230 while (V_nipq > V_maxnipq && 1231 !TAILQ_EMPTY(&V_ipq[i])) { 1232 IPSTAT_ADD(ips_fragdropped, 1233 TAILQ_FIRST(&V_ipq[i])->ipq_nfrags); 1234 ip_freef(&V_ipq[i], 1235 TAILQ_FIRST(&V_ipq[i])); 1236 } 1237 } 1238 } 1239 CURVNET_RESTORE(); 1240 } 1241 VNET_LIST_RUNLOCK(); 1242 IPQ_UNLOCK(); 1243 } 1244 1245 /* 1246 * Drain off all datagram fragments. 1247 */ 1248 void 1249 ip_drain(void) 1250 { 1251 VNET_ITERATOR_DECL(vnet_iter); 1252 int i; 1253 1254 IPQ_LOCK(); 1255 VNET_LIST_RLOCK(); 1256 VNET_FOREACH(vnet_iter) { 1257 CURVNET_SET(vnet_iter); 1258 INIT_VNET_INET(vnet_iter); 1259 for (i = 0; i < IPREASS_NHASH; i++) { 1260 while(!TAILQ_EMPTY(&V_ipq[i])) { 1261 IPSTAT_ADD(ips_fragdropped, 1262 TAILQ_FIRST(&V_ipq[i])->ipq_nfrags); 1263 ip_freef(&V_ipq[i], TAILQ_FIRST(&V_ipq[i])); 1264 } 1265 } 1266 CURVNET_RESTORE(); 1267 } 1268 VNET_LIST_RUNLOCK(); 1269 IPQ_UNLOCK(); 1270 in_rtqdrain(); 1271 } 1272 1273 /* 1274 * The protocol to be inserted into ip_protox[] must be already registered 1275 * in inetsw[], either statically or through pf_proto_register(). 1276 */ 1277 int 1278 ipproto_register(u_char ipproto) 1279 { 1280 struct protosw *pr; 1281 1282 /* Sanity checks. */ 1283 if (ipproto == 0) 1284 return (EPROTONOSUPPORT); 1285 1286 /* 1287 * The protocol slot must not be occupied by another protocol 1288 * already. An index pointing to IPPROTO_RAW is unused. 1289 */ 1290 pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); 1291 if (pr == NULL) 1292 return (EPFNOSUPPORT); 1293 if (ip_protox[ipproto] != pr - inetsw) /* IPPROTO_RAW */ 1294 return (EEXIST); 1295 1296 /* Find the protocol position in inetsw[] and set the index. */ 1297 for (pr = inetdomain.dom_protosw; 1298 pr < inetdomain.dom_protoswNPROTOSW; pr++) { 1299 if (pr->pr_domain->dom_family == PF_INET && 1300 pr->pr_protocol && pr->pr_protocol == ipproto) { 1301 /* Be careful to only index valid IP protocols. 
			 */
			if (pr->pr_protocol < IPPROTO_MAX) {
				ip_protox[pr->pr_protocol] = pr - inetsw;
				return (0);
			} else
				return (EINVAL);
		}
	}
	return (EPROTONOSUPPORT);
}

/*
 * Remove a protocol from ip_protox[] by pointing its slot back at the
 * IPPROTO_RAW entry (the "unused" marker used by ipproto_register()).
 */
int
ipproto_unregister(u_char ipproto)
{
	struct protosw *pr;

	/* Sanity checks. */
	if (ipproto == 0)
		return (EPROTONOSUPPORT);

	/* Check if the protocol was indeed registered. */
	pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
	if (pr == NULL)
		return (EPFNOSUPPORT);
	if (ip_protox[ipproto] == pr - inetsw)	/* IPPROTO_RAW */
		return (ENOENT);

	/* Reset the protocol slot to IPPROTO_RAW. */
	ip_protox[ipproto] = pr - inetsw;
	return (0);
}

/*
 * Given address of next destination (final or next hop),
 * return internet address info of interface to be used to get there.
 *
 * NOTE(review): the route is released with RTFREE() before the ifa is
 * returned and no reference is taken on the ifa itself; presumably
 * callers rely on interface addresses remaining stable while they use
 * the result — confirm against the callers' locking.
 */
struct in_ifaddr *
ip_rtaddr(struct in_addr dst, u_int fibnum)
{
	struct route sro;
	struct sockaddr_in *sin;
	struct in_ifaddr *ifa;

	bzero(&sro, sizeof(sro));
	sin = (struct sockaddr_in *)&sro.ro_dst;
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(*sin);
	sin->sin_addr = dst;
	in_rtalloc_ign(&sro, 0, fibnum);

	if (sro.ro_rt == NULL)
		return (NULL);

	ifa = ifatoia(sro.ro_rt->rt_ifa);
	RTFREE(sro.ro_rt);
	return (ifa);
}

/*
 * Table mapping PRC_* protocol-control codes to errno values; indexed
 * by control code (PRC_NCMDS entries).  Zero entries presumably mean
 * "no error to report" — confirm against the consumers of this table.
 */
u_char inetctlerrmap[PRC_NCMDS] = {
	0,		0,		0,		0,
	0,		EMSGSIZE,	EHOSTDOWN,	EHOSTUNREACH,
	EHOSTUNREACH,	EHOSTUNREACH,	ECONNREFUSED,	ECONNREFUSED,
	EMSGSIZE,	EHOSTUNREACH,	0,		0,
	0,		0,		EHOSTUNREACH,	0,
	ENOPROTOOPT,	ECONNREFUSED
};

/*
 * Forward a packet.  If some error occurs return the sender
 * an icmp packet.  Note we can't always generate a meaningful
 * icmp message because icmp doesn't have a large enough repertoire
 * of codes and types.
 *
 * If not forwarding, just drop the packet.
 * This could be confusing
 * if ipforwarding was zero but some routing protocol was advancing
 * us as a gateway to somewhere.  However, we must let the routing
 * protocol deal with that.
 *
 * The srcrt parameter indicates whether the packet is being forwarded
 * via a source route.
 */
void
ip_forward(struct mbuf *m, int srcrt)
{
	INIT_VNET_INET(curvnet);
	struct ip *ip = mtod(m, struct ip *);
	struct in_ifaddr *ia;
	struct mbuf *mcopy;	/* header+payload snapshot for ICMP errors */
	struct in_addr dest;	/* redirect gateway, 0 if no redirect */
	struct route ro;
	int error, type = 0, code = 0, mtu = 0;

	/* Never forward broadcast/multicast or non-forwardable addresses. */
	if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) {
		IPSTAT_INC(ips_cantforward);
		m_freem(m);
		return;
	}
#ifdef IPSTEALTH
	if (!V_ipstealth) {
#endif
		if (ip->ip_ttl <= IPTTLDEC) {
			icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS,
			    0, 0);
			return;
		}
#ifdef IPSTEALTH
	}
#endif

	ia = ip_rtaddr(ip->ip_dst, M_GETFIB(m));
#ifndef IPSEC
	/*
	 * 'ia' may be NULL if there is no route for this destination.
	 * In case of IPsec, don't discard it just yet, but pass it to
	 * ip_output in case of outgoing IPsec policy.
	 */
	if (!srcrt && ia == NULL) {
		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
		return;
	}
#endif

	/*
	 * Save the IP header and at most 8 bytes of the payload,
	 * in case we need to generate an ICMP message to the src.
	 *
	 * XXX this can be optimized a lot by saving the data in a local
	 * buffer on the stack (72 bytes at most), and only allocating the
	 * mbuf if really necessary.  The vast majority of the packets
	 * are forwarded without having to send an ICMP back (either
	 * because unnecessary, or because rate limited), so we are
	 * really wasting a lot of work here.
	 *
	 * We don't use m_copy() because it might return a reference
	 * to a shared cluster.  Both this function and ip_output()
	 * assume exclusive access to the IP header in `m', so any
	 * data in a cluster may change before we reach icmp_error().
	 */
	MGETHDR(mcopy, M_DONTWAIT, m->m_type);
	if (mcopy != NULL && !m_dup_pkthdr(mcopy, m, M_DONTWAIT)) {
		/*
		 * It's probably ok if the pkthdr dup fails (because
		 * the deep copy of the tag chain failed), but for now
		 * be conservative and just discard the copy since
		 * code below may some day want the tags.
		 */
		m_free(mcopy);
		mcopy = NULL;
	}
	if (mcopy != NULL) {
		mcopy->m_len = min(ip->ip_len, M_TRAILINGSPACE(mcopy));
		mcopy->m_pkthdr.len = mcopy->m_len;
		m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t));
	}

#ifdef IPSTEALTH
	if (!V_ipstealth) {
#endif
		ip->ip_ttl -= IPTTLDEC;
#ifdef IPSTEALTH
	}
#endif

	/*
	 * If forwarding packet using same interface that it came in on,
	 * perhaps should send a redirect to sender to shortcut a hop.
	 * Only send redirect if source is sending directly to us,
	 * and if packet was not source routed (or has any options).
	 * Also, don't send redirect if forwarding using a default route
	 * or a route modified by a redirect.
	 */
	dest.s_addr = 0;
	if (!srcrt && V_ipsendredirects &&
	    ia != NULL && ia->ia_ifp == m->m_pkthdr.rcvif) {
		struct sockaddr_in *sin;
		struct rtentry *rt;

		bzero(&ro, sizeof(ro));
		sin = (struct sockaddr_in *)&ro.ro_dst;
		sin->sin_family = AF_INET;
		sin->sin_len = sizeof(*sin);
		sin->sin_addr = ip->ip_dst;
		in_rtalloc_ign(&ro, 0, M_GETFIB(m));

		rt = ro.ro_rt;

		/* A zero route key means a default route: no redirect. */
		if (rt && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 &&
		    satosin(rt_key(rt))->sin_addr.s_addr != 0) {
#define	RTA(rt)	((struct in_ifaddr *)(rt->rt_ifa))
			u_long src = ntohl(ip->ip_src.s_addr);

			/* Redirect only hosts on the attached subnet. */
			if (RTA(rt) &&
			    (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) {
				if (rt->rt_flags & RTF_GATEWAY)
					dest.s_addr = satosin(rt->rt_gateway)->sin_addr.s_addr;
				else
					dest.s_addr = ip->ip_dst.s_addr;
				/* Router requirements says to only send host redirects */
				type = ICMP_REDIRECT;
				code = ICMP_REDIRECT_HOST;
			}
		}
		if (rt)
			RTFREE(rt);
	}

	/*
	 * Try to cache the route MTU from ip_output so we can consider it for
	 * the ICMP_UNREACH_NEEDFRAG "Next-Hop MTU" field described in RFC1191.
	 */
	bzero(&ro, sizeof(ro));

	error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL, NULL);

	if (error == EMSGSIZE && ro.ro_rt)
		mtu = ro.ro_rt->rt_rmx.rmx_mtu;
	if (ro.ro_rt)
		RTFREE(ro.ro_rt);

	if (error)
		IPSTAT_INC(ips_cantforward);
	else {
		IPSTAT_INC(ips_forward);
		if (type)
			IPSTAT_INC(ips_redirectsent);
		else {
			/* Forwarded cleanly, no redirect: nothing to report. */
			if (mcopy)
				m_freem(mcopy);
			return;
		}
	}
	/* Without the saved copy we cannot build an ICMP error. */
	if (mcopy == NULL)
		return;

	switch (error) {

	case 0:				/* forwarded, but need redirect */
		/* type, code set above */
		break;

	case ENETUNREACH:
	case EHOSTUNREACH:
	case ENETDOWN:
	case EHOSTDOWN:
	default:
		type = ICMP_UNREACH;
		code = ICMP_UNREACH_HOST;
		break;

	case EMSGSIZE:
		type = ICMP_UNREACH;
		code = ICMP_UNREACH_NEEDFRAG;

#ifdef IPSEC
		/*
		 * If IPsec is configured for this path,
		 * override any possibly mtu value set by ip_output.
		 */
		mtu = ip_ipsec_mtu(m, mtu);
#endif /* IPSEC */
		/*
		 * If the MTU was set before make sure we are below the
		 * interface MTU.
		 * If the MTU wasn't set before use the interface mtu or
		 * fall back to the next smaller mtu step compared to the
		 * current packet size.
		 */
		if (mtu != 0) {
			if (ia != NULL)
				mtu = min(mtu, ia->ia_ifp->if_mtu);
		} else {
			if (ia != NULL)
				mtu = ia->ia_ifp->if_mtu;
			else
				mtu = ip_next_mtu(ip->ip_len, 0);
		}
		IPSTAT_INC(ips_cantfrag);
		break;

	case ENOBUFS:
		/*
		 * A router should not generate ICMP_SOURCEQUENCH as
		 * required in RFC1812 Requirements for IP Version 4 Routers.
		 * Source quench could be a big problem under DoS attacks,
		 * or if the underlying interface is rate-limited.
		 * Those who need source quench packets may re-enable them
		 * via the net.inet.ip.sendsourcequench sysctl.
		 */
		if (V_ip_sendsourcequench == 0) {
			m_freem(mcopy);
			return;
		} else {
			type = ICMP_SOURCEQUENCH;
			code = 0;
		}
		break;

	case EACCES:			/* ipfw denied packet */
		m_freem(mcopy);
		return;
	}
	icmp_error(mcopy, type, code, dest.s_addr, mtu);
}

/*
 * Append the control-message options requested on this socket
 * (timestamps, destination address, TTL, receiving interface, ...)
 * to the chain rooted at *mp; mp is advanced past each mbuf added.
 */
void
ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip,
    struct mbuf *m)
{
	INIT_VNET_NET(inp->inp_vnet);

	if (inp->inp_socket->so_options & (SO_BINTIME | SO_TIMESTAMP)) {
		struct bintime bt;

		/* One clock read serves both option formats. */
		bintime(&bt);
		if (inp->inp_socket->so_options & SO_BINTIME) {
			*mp = sbcreatecontrol((caddr_t) &bt, sizeof(bt),
			    SCM_BINTIME, SOL_SOCKET);
			if (*mp)
				mp = &(*mp)->m_next;
		}
		if (inp->inp_socket->so_options & SO_TIMESTAMP) {
			struct timeval tv;

			bintime2timeval(&bt, &tv);
			*mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv),
			    SCM_TIMESTAMP, SOL_SOCKET);
			if (*mp)
				mp = &(*mp)->m_next;
		}
	}
	if (inp->inp_flags & INP_RECVDSTADDR) {
		*mp = sbcreatecontrol((caddr_t) &ip->ip_dst,
		    sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP);
		if (*mp)
			mp = &(*mp)->m_next;
	}
	if (inp->inp_flags & INP_RECVTTL) {
		*mp = sbcreatecontrol((caddr_t) &ip->ip_ttl,
		    sizeof(u_char), IP_RECVTTL, IPPROTO_IP);
		if (*mp)
			mp = &(*mp)->m_next;
	}
#ifdef notyet
	/* XXX
	 * Moving these out of udp_input() made them even more broken
	 * than they already were.
	 */
	/* options were tossed already */
	if (inp->inp_flags & INP_RECVOPTS) {
		*mp = sbcreatecontrol((caddr_t) opts_deleted_above,
		    sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP);
		if (*mp)
			mp = &(*mp)->m_next;
	}
	/* ip_srcroute doesn't do what we want here, need to fix */
	if (inp->inp_flags & INP_RECVRETOPTS) {
		*mp = sbcreatecontrol((caddr_t) ip_srcroute(m),
		    sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP);
		if (*mp)
			mp = &(*mp)->m_next;
	}
#endif
	if (inp->inp_flags & INP_RECVIF) {
		struct ifnet *ifp;
		struct sdlbuf {
			struct sockaddr_dl sdl;
			/* room for link-level addresses past the fixed part */
			u_char pad[32];
		} sdlbuf;
		struct sockaddr_dl *sdp;
		struct sockaddr_dl *sdl2 = &sdlbuf.sdl;

		/* Only trust an rcvif whose index is in the valid range. */
		if (((ifp = m->m_pkthdr.rcvif))
		    && (ifp->if_index && (ifp->if_index <= V_if_index))) {
			sdp = (struct sockaddr_dl *)ifp->if_addr->ifa_addr;
			/*
			 * Change our mind and don't try copy.
			 */
			if ((sdp->sdl_family != AF_LINK)
			    || (sdp->sdl_len > sizeof(sdlbuf))) {
				goto makedummy;
			}
			bcopy(sdp, sdl2, sdp->sdl_len);
		} else {
makedummy:
			/* Synthesize a minimal AF_LINK sockaddr instead. */
			sdl2->sdl_len
				= offsetof(struct sockaddr_dl, sdl_data[0]);
			sdl2->sdl_family = AF_LINK;
			sdl2->sdl_index = 0;
			sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0;
		}
		*mp = sbcreatecontrol((caddr_t) sdl2, sdl2->sdl_len,
		    IP_RECVIF, IPPROTO_IP);
		if (*mp)
			mp = &(*mp)->m_next;
	}
}

/*
 * XXXRW: Multicast routing code in ip_mroute.c is generally MPSAFE, but the
 * ip_rsvp and ip_rsvp_on variables need to be interlocked with rsvp_on
 * locking.  This code remains in ip_input.c as ip_mroute.c is optionally
 * compiled.
1703 */ 1704 int 1705 ip_rsvp_init(struct socket *so) 1706 { 1707 INIT_VNET_INET(so->so_vnet); 1708 1709 if (so->so_type != SOCK_RAW || 1710 so->so_proto->pr_protocol != IPPROTO_RSVP) 1711 return EOPNOTSUPP; 1712 1713 if (V_ip_rsvpd != NULL) 1714 return EADDRINUSE; 1715 1716 V_ip_rsvpd = so; 1717 /* 1718 * This may seem silly, but we need to be sure we don't over-increment 1719 * the RSVP counter, in case something slips up. 1720 */ 1721 if (!V_ip_rsvp_on) { 1722 V_ip_rsvp_on = 1; 1723 V_rsvp_on++; 1724 } 1725 1726 return 0; 1727 } 1728 1729 int 1730 ip_rsvp_done(void) 1731 { 1732 INIT_VNET_INET(curvnet); 1733 1734 V_ip_rsvpd = NULL; 1735 /* 1736 * This may seem silly, but we need to be sure we don't over-decrement 1737 * the RSVP counter, in case something slips up. 1738 */ 1739 if (V_ip_rsvp_on) { 1740 V_ip_rsvp_on = 0; 1741 V_rsvp_on--; 1742 } 1743 return 0; 1744 } 1745 1746 void 1747 rsvp_input(struct mbuf *m, int off) /* XXX must fixup manually */ 1748 { 1749 INIT_VNET_INET(curvnet); 1750 1751 if (rsvp_input_p) { /* call the real one if loaded */ 1752 rsvp_input_p(m, off); 1753 return; 1754 } 1755 1756 /* Can still get packets with rsvp_on = 0 if there is a local member 1757 * of the group to which the RSVP packet is addressed. But in this 1758 * case we want to throw the packet away. 1759 */ 1760 1761 if (!V_rsvp_on) { 1762 m_freem(m); 1763 return; 1764 } 1765 1766 if (V_ip_rsvpd != NULL) { 1767 rip_input(m, off); 1768 return; 1769 } 1770 /* Drop the packet */ 1771 m_freem(m); 1772 } 1773