1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1988, 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. Neither the name of the University nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 
30 * 31 * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 32 */ 33 34 #include <sys/cdefs.h> 35 __FBSDID("$FreeBSD$"); 36 37 #include "opt_bootp.h" 38 #include "opt_ipstealth.h" 39 #include "opt_ipsec.h" 40 #include "opt_route.h" 41 #include "opt_rss.h" 42 43 #include <sys/param.h> 44 #include <sys/systm.h> 45 #include <sys/hhook.h> 46 #include <sys/mbuf.h> 47 #include <sys/malloc.h> 48 #include <sys/domain.h> 49 #include <sys/protosw.h> 50 #include <sys/socket.h> 51 #include <sys/time.h> 52 #include <sys/kernel.h> 53 #include <sys/lock.h> 54 #include <sys/rmlock.h> 55 #include <sys/rwlock.h> 56 #include <sys/sdt.h> 57 #include <sys/syslog.h> 58 #include <sys/sysctl.h> 59 60 #include <net/if.h> 61 #include <net/if_types.h> 62 #include <net/if_var.h> 63 #include <net/if_dl.h> 64 #include <net/pfil.h> 65 #include <net/route.h> 66 #include <net/netisr.h> 67 #include <net/rss_config.h> 68 #include <net/vnet.h> 69 70 #include <netinet/in.h> 71 #include <netinet/in_kdtrace.h> 72 #include <netinet/in_systm.h> 73 #include <netinet/in_var.h> 74 #include <netinet/ip.h> 75 #include <netinet/in_pcb.h> 76 #include <netinet/ip_var.h> 77 #include <netinet/ip_fw.h> 78 #include <netinet/ip_icmp.h> 79 #include <netinet/ip_options.h> 80 #include <machine/in_cksum.h> 81 #include <netinet/ip_carp.h> 82 #include <netinet/in_rss.h> 83 84 #include <netipsec/ipsec_support.h> 85 86 #include <sys/socketvar.h> 87 88 #include <security/mac/mac_framework.h> 89 90 #ifdef CTASSERT 91 CTASSERT(sizeof(struct ip) == 20); 92 #endif 93 94 /* IP reassembly functions are defined in ip_reass.c. 
 */
extern void ipreass_init(void);
extern void ipreass_drain(void);
extern void ipreass_slowtimo(void);
#ifdef VIMAGE
extern void ipreass_destroy(void);
#endif

/*
 * Read-mostly lock, set up at boot through RM_SYSINIT.
 * NOTE(review): presumably the lock taken by IN_IFADDR_RLOCK() around the
 * address hash walk in ip_input() -- confirm against netinet/in_var.h.
 */
struct rmlock in_ifaddr_lock;
RM_SYSINIT(in_ifaddr_lock, &in_ifaddr_lock, "in_ifaddr_lock");

/*
 * Per-vnet flag.  When it is non-zero ip_input() accepts every
 * IPPROTO_RSVP packet for local delivery.
 */
VNET_DEFINE(int, rsvp_on);

/*
 * Per-vnet forwarding switch, exported read-write by the sysctl below.
 * ip_input() drops packets that are not for this host while it is zero.
 */
VNET_DEFINE(int, ipforwarding);
SYSCTL_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(ipforwarding), 0,
    "Enable IP forwarding between interfaces");

/* Per-vnet switch for sending ICMP redirects; initialized to 1. */
VNET_DEFINE_STATIC(int, ipsendredirects) = 1; /* XXX */
#define	V_ipsendredirects	VNET(ipsendredirects)
SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(ipsendredirects), 0,
    "Enable sending IP redirects");

/*
 * XXX - Setting ip_checkinterface mostly implements the receive side of
 * the Strong ES model described in RFC 1122, but since the routing table
 * and transmit implementation do not implement the Strong ES model,
 * setting this to 1 results in an odd hybrid.
 *
 * XXX - ip_checkinterface currently must be disabled if you use ipnat
 * to translate the destination address to another local interface.
 *
 * XXX - ip_checkinterface must be disabled if you add IP aliases
 * to the loopback interface instead of the interface where the
 * packets for those addresses are received.
 */
VNET_DEFINE_STATIC(int, ip_checkinterface);
#define	V_ip_checkinterface	VNET(ip_checkinterface)
SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(ip_checkinterface), 0,
    "Verify packet arrives on correct interface");

VNET_DEFINE(pfil_head_t, inet_pfil_head);	/* Packet filter hooks */

/*
 * netisr registration record for NETISR_IP; queued packets are handed to
 * ip_input().  With RSS the CPU is picked by rss_soft_m2cpuid_v4 under the
 * CPU policy with hybrid dispatch, otherwise the flow policy is used.
 */
static struct netisr_handler ip_nh = {
	.nh_name = "ip",
	.nh_handler = ip_input,
	.nh_proto = NETISR_IP,
#ifdef	RSS
	.nh_m2cpuid = rss_soft_m2cpuid_v4,
	.nh_policy = NETISR_POLICY_CPU,
	.nh_dispatch = NETISR_DISPATCH_HYBRID,
#else
	.nh_policy = NETISR_POLICY_FLOW,
#endif
};

#ifdef	RSS
/*
 * Directly dispatched frames are currently assumed
 * to have a flowid already calculated.
 *
 * There should probably be an assertion that the frame
 * actually has valid flow details.
 */
static struct netisr_handler ip_direct_nh = {
	.nh_name = "ip_direct",
	.nh_handler = ip_direct_input,
	.nh_proto = NETISR_IP_DIRECT,
	.nh_m2cpuid = rss_soft_m2cpuid_v4,
	.nh_policy = NETISR_POLICY_CPU,
	.nh_dispatch = NETISR_DISPATCH_HYBRID,
};
#endif

extern	struct domain inetdomain;
extern	struct protosw inetsw[];
/*
 * Indexed by IP protocol number; each entry is an index into inetsw[].
 * Filled in by ip_init() and updated by ipproto_register() and
 * ipproto_unregister().
 */
u_char	ip_protox[IPPROTO_MAX];
VNET_DEFINE(struct in_ifaddrhead, in_ifaddrhead);  /* first inet address */
VNET_DEFINE(struct in_ifaddrhashhead *, in_ifaddrhashtbl); /* inet addr hash table  */
VNET_DEFINE(u_long, in_ifaddrhmask);		/* mask for hash table */

#ifdef IPCTL_DEFMTU
SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW,
    &ip_mtu, 0, "Default MTU");
#endif

#ifdef IPSTEALTH
/* Per-vnet switch; while set, ip_forward() leaves the TTL untouched. */
VNET_DEFINE(int, ipstealth);
SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(ipstealth), 0,
    "IP stealth mode, no TTL decrementation on forwarding");
#endif

/*
 * IP statistics are stored in the "array" of counter(9)s.
191 */ 192 VNET_PCPUSTAT_DEFINE(struct ipstat, ipstat); 193 VNET_PCPUSTAT_SYSINIT(ipstat); 194 SYSCTL_VNET_PCPUSTAT(_net_inet_ip, IPCTL_STATS, stats, struct ipstat, ipstat, 195 "IP statistics (struct ipstat, netinet/ip_var.h)"); 196 197 #ifdef VIMAGE 198 VNET_PCPUSTAT_SYSUNINIT(ipstat); 199 #endif /* VIMAGE */ 200 201 /* 202 * Kernel module interface for updating ipstat. The argument is an index 203 * into ipstat treated as an array. 204 */ 205 void 206 kmod_ipstat_inc(int statnum) 207 { 208 209 counter_u64_add(VNET(ipstat)[statnum], 1); 210 } 211 212 void 213 kmod_ipstat_dec(int statnum) 214 { 215 216 counter_u64_add(VNET(ipstat)[statnum], -1); 217 } 218 219 static int 220 sysctl_netinet_intr_queue_maxlen(SYSCTL_HANDLER_ARGS) 221 { 222 int error, qlimit; 223 224 netisr_getqlimit(&ip_nh, &qlimit); 225 error = sysctl_handle_int(oidp, &qlimit, 0, req); 226 if (error || !req->newptr) 227 return (error); 228 if (qlimit < 1) 229 return (EINVAL); 230 return (netisr_setqlimit(&ip_nh, qlimit)); 231 } 232 SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen, 233 CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_queue_maxlen, "I", 234 "Maximum size of the IP input queue"); 235 236 static int 237 sysctl_netinet_intr_queue_drops(SYSCTL_HANDLER_ARGS) 238 { 239 u_int64_t qdrops_long; 240 int error, qdrops; 241 242 netisr_getqdrops(&ip_nh, &qdrops_long); 243 qdrops = qdrops_long; 244 error = sysctl_handle_int(oidp, &qdrops, 0, req); 245 if (error || !req->newptr) 246 return (error); 247 if (qdrops != 0) 248 return (EINVAL); 249 netisr_clearqdrops(&ip_nh); 250 return (0); 251 } 252 253 SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops, 254 CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_queue_drops, "I", 255 "Number of packets dropped from the IP input queue"); 256 257 #ifdef RSS 258 static int 259 sysctl_netinet_intr_direct_queue_maxlen(SYSCTL_HANDLER_ARGS) 260 { 261 int error, qlimit; 262 263 netisr_getqlimit(&ip_direct_nh, &qlimit); 264 error = 
sysctl_handle_int(oidp, &qlimit, 0, req); 265 if (error || !req->newptr) 266 return (error); 267 if (qlimit < 1) 268 return (EINVAL); 269 return (netisr_setqlimit(&ip_direct_nh, qlimit)); 270 } 271 SYSCTL_PROC(_net_inet_ip, IPCTL_INTRDQMAXLEN, intr_direct_queue_maxlen, 272 CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_direct_queue_maxlen, 273 "I", "Maximum size of the IP direct input queue"); 274 275 static int 276 sysctl_netinet_intr_direct_queue_drops(SYSCTL_HANDLER_ARGS) 277 { 278 u_int64_t qdrops_long; 279 int error, qdrops; 280 281 netisr_getqdrops(&ip_direct_nh, &qdrops_long); 282 qdrops = qdrops_long; 283 error = sysctl_handle_int(oidp, &qdrops, 0, req); 284 if (error || !req->newptr) 285 return (error); 286 if (qdrops != 0) 287 return (EINVAL); 288 netisr_clearqdrops(&ip_direct_nh); 289 return (0); 290 } 291 292 SYSCTL_PROC(_net_inet_ip, IPCTL_INTRDQDROPS, intr_direct_queue_drops, 293 CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_direct_queue_drops, "I", 294 "Number of packets dropped from the IP direct input queue"); 295 #endif /* RSS */ 296 297 /* 298 * IP initialization: fill in IP protocol switch table. 299 * All protocols not implemented in kernel go to raw IP protocol handler. 300 */ 301 void 302 ip_init(void) 303 { 304 struct pfil_head_args args; 305 struct protosw *pr; 306 int i; 307 308 CK_STAILQ_INIT(&V_in_ifaddrhead); 309 V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &V_in_ifaddrhmask); 310 311 /* Initialize IP reassembly queue. */ 312 ipreass_init(); 313 314 /* Initialize packet filter hooks. 
*/ 315 args.pa_version = PFIL_VERSION; 316 args.pa_flags = PFIL_IN | PFIL_OUT; 317 args.pa_type = PFIL_TYPE_IP4; 318 args.pa_headname = PFIL_INET_NAME; 319 V_inet_pfil_head = pfil_head_register(&args); 320 321 if (hhook_head_register(HHOOK_TYPE_IPSEC_IN, AF_INET, 322 &V_ipsec_hhh_in[HHOOK_IPSEC_INET], 323 HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0) 324 printf("%s: WARNING: unable to register input helper hook\n", 325 __func__); 326 if (hhook_head_register(HHOOK_TYPE_IPSEC_OUT, AF_INET, 327 &V_ipsec_hhh_out[HHOOK_IPSEC_INET], 328 HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0) 329 printf("%s: WARNING: unable to register output helper hook\n", 330 __func__); 331 332 /* Skip initialization of globals for non-default instances. */ 333 #ifdef VIMAGE 334 if (!IS_DEFAULT_VNET(curvnet)) { 335 netisr_register_vnet(&ip_nh); 336 #ifdef RSS 337 netisr_register_vnet(&ip_direct_nh); 338 #endif 339 return; 340 } 341 #endif 342 343 pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); 344 if (pr == NULL) 345 panic("ip_init: PF_INET not found"); 346 347 /* Initialize the entire ip_protox[] array to IPPROTO_RAW. */ 348 for (i = 0; i < IPPROTO_MAX; i++) 349 ip_protox[i] = pr - inetsw; 350 /* 351 * Cycle through IP protocols and put them into the appropriate place 352 * in ip_protox[]. 353 */ 354 for (pr = inetdomain.dom_protosw; 355 pr < inetdomain.dom_protoswNPROTOSW; pr++) 356 if (pr->pr_domain->dom_family == PF_INET && 357 pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) { 358 /* Be careful to only index valid IP protocols. 
*/ 359 if (pr->pr_protocol < IPPROTO_MAX) 360 ip_protox[pr->pr_protocol] = pr - inetsw; 361 } 362 363 netisr_register(&ip_nh); 364 #ifdef RSS 365 netisr_register(&ip_direct_nh); 366 #endif 367 } 368 369 #ifdef VIMAGE 370 static void 371 ip_destroy(void *unused __unused) 372 { 373 struct ifnet *ifp; 374 int error; 375 376 #ifdef RSS 377 netisr_unregister_vnet(&ip_direct_nh); 378 #endif 379 netisr_unregister_vnet(&ip_nh); 380 381 pfil_head_unregister(V_inet_pfil_head); 382 error = hhook_head_deregister(V_ipsec_hhh_in[HHOOK_IPSEC_INET]); 383 if (error != 0) { 384 printf("%s: WARNING: unable to deregister input helper hook " 385 "type HHOOK_TYPE_IPSEC_IN, id HHOOK_IPSEC_INET: " 386 "error %d returned\n", __func__, error); 387 } 388 error = hhook_head_deregister(V_ipsec_hhh_out[HHOOK_IPSEC_INET]); 389 if (error != 0) { 390 printf("%s: WARNING: unable to deregister output helper hook " 391 "type HHOOK_TYPE_IPSEC_OUT, id HHOOK_IPSEC_INET: " 392 "error %d returned\n", __func__, error); 393 } 394 395 /* Remove the IPv4 addresses from all interfaces. */ 396 in_ifscrub_all(); 397 398 /* Make sure the IPv4 routes are gone as well. */ 399 IFNET_RLOCK(); 400 CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) 401 rt_flushifroutes_af(ifp, AF_INET); 402 IFNET_RUNLOCK(); 403 404 /* Destroy IP reassembly queue. */ 405 ipreass_destroy(); 406 407 /* Cleanup in_ifaddr hash table; should be empty. */ 408 hashdestroy(V_in_ifaddrhashtbl, M_IFADDR, V_in_ifaddrhmask); 409 } 410 411 VNET_SYSUNINIT(ip, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, ip_destroy, NULL); 412 #endif 413 414 #ifdef RSS 415 /* 416 * IP direct input routine. 417 * 418 * This is called when reinjecting completed fragments where 419 * all of the previous checking and book-keeping has been done. 
 *
 * No header validation is repeated here: the header length is read
 * straight from the packet, IPsec input processing runs when enabled,
 * and the packet is then counted as delivered and passed to the
 * pr_input routine selected through ip_protox[].
 */
void
ip_direct_input(struct mbuf *m)
{
	struct ip *ip;
	int hlen;

	ip = mtod(m, struct ip *);
	hlen = ip->ip_hl << 2;

#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	if (IPSEC_ENABLED(ipv4)) {
		if (IPSEC_INPUT(ipv4, m, hlen, ip->ip_p) != 0)
			return;
	}
#endif /* IPSEC */
	IPSTAT_INC(ips_delivered);
	(*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p);
	return;
}
#endif

/*
 * Ip input routine.  Checksum and byte swap header.  If fragmented
 * try to reassemble.  Process options.  Pass to next level.
 *
 * Labels used in the body:
 *   tooshort - count ips_tooshort, then fall into the drop path.
 *   passin   - continue here once inbound packet filtering is done or
 *              has been skipped.
 *   ours     - the packet is accepted for local delivery.
 *   bad      - free the mbuf and return.
 */
void
ip_input(struct mbuf *m)
{
	struct rm_priotracker in_ifa_tracker;
	struct ip *ip = NULL;
	struct in_ifaddr *ia = NULL;
	struct ifaddr *ifa;
	struct ifnet *ifp;
	int checkif, hlen = 0;
	uint16_t sum, ip_len;
	int dchg = 0;				/* dest changed after fw */
	struct in_addr odst;			/* original dst address */

	M_ASSERTPKTHDR(m);

	/*
	 * A packet that arrives with M_FASTFWD_OURS set skips all of the
	 * validation and filtering below and goes straight to delivery.
	 */
	if (m->m_flags & M_FASTFWD_OURS) {
		m->m_flags &= ~M_FASTFWD_OURS;
		/* Set up some basics that will be used later. */
		ip = mtod(m, struct ip *);
		hlen = ip->ip_hl << 2;
		ip_len = ntohs(ip->ip_len);
		goto ours;
	}

	IPSTAT_INC(ips_total);

	if (m->m_pkthdr.len < sizeof(struct ip))
		goto tooshort;

	/* Make the fixed part of the header contiguous in the first mbuf. */
	if (m->m_len < sizeof (struct ip) &&
	    (m = m_pullup(m, sizeof (struct ip))) == NULL) {
		IPSTAT_INC(ips_toosmall);
		return;
	}
	ip = mtod(m, struct ip *);

	if (ip->ip_v != IPVERSION) {
		IPSTAT_INC(ips_badvers);
		goto bad;
	}

	hlen = ip->ip_hl << 2;
	if (hlen < sizeof(struct ip)) {	/* minimum header length */
		IPSTAT_INC(ips_badhlen);
		goto bad;
	}
	/* Pull up the options too; ip must be reloaded afterwards. */
	if (hlen > m->m_len) {
		if ((m = m_pullup(m, hlen)) == NULL) {
			IPSTAT_INC(ips_badhlen);
			return;
		}
		ip = mtod(m, struct ip *);
	}

	IP_PROBE(receive, NULL, NULL, ip, m->m_pkthdr.rcvif, ip, NULL);

	/* 127/8 must not appear on wire - RFC1122 */
	ifp = m->m_pkthdr.rcvif;
	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
		if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
			IPSTAT_INC(ips_badaddr);
			goto bad;
		}
	}

	/*
	 * Use the checksum verdict recorded in csum_flags when there is
	 * one, otherwise compute it here.  A non-zero sum means failure.
	 */
	if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) {
		sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID);
	} else {
		if (hlen == sizeof(struct ip)) {
			sum = in_cksum_hdr(ip);
		} else {
			sum = in_cksum(m, hlen);
		}
	}
	if (sum) {
		IPSTAT_INC(ips_badsum);
		goto bad;
	}

#ifdef ALTQ
	if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0)
		/* packet is dropped by traffic conditioner */
		return;
#endif

	ip_len = ntohs(ip->ip_len);
	if (ip_len < hlen) {
		IPSTAT_INC(ips_badlen);
		goto bad;
	}

	/*
	 * Check that the amount of data in the buffers
	 * is at least as much as the IP header would have us expect.
	 * Trim mbufs if longer than we expect.
	 * Drop packet if shorter than we expect.
	 */
	if (m->m_pkthdr.len < ip_len) {
tooshort:
		IPSTAT_INC(ips_tooshort);
		goto bad;
	}
	if (m->m_pkthdr.len > ip_len) {
		if (m->m_len == m->m_pkthdr.len) {
			/* Single mbuf: just shorten it. */
			m->m_len = ip_len;
			m->m_pkthdr.len = ip_len;
		} else
			/* The count passed here is negative. */
			m_adj(m, ip_len - m->m_pkthdr.len);
	}

	/*
	 * Try to forward the packet, but if we fail continue.
	 * ip_tryforward() does not generate redirects, so fall
	 * through to normal processing if redirects are required.
	 * ip_tryforward() does inbound and outbound packet firewall
	 * processing. If firewall has decided that destination becomes
	 * our local address, it sets M_FASTFWD_OURS flag. In this
	 * case skip another inbound firewall processing and update
	 * ip pointer.
	 */
	if (V_ipforwarding != 0 && V_ipsendredirects == 0
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	    && (!IPSEC_ENABLED(ipv4) ||
	    IPSEC_CAPS(ipv4, m, IPSEC_CAP_OPERABLE) == 0)
#endif
	    ) {
		if ((m = ip_tryforward(m)) == NULL)
			return;
		if (m->m_flags & M_FASTFWD_OURS) {
			m->m_flags &= ~M_FASTFWD_OURS;
			ip = mtod(m, struct ip *);
			goto ours;
		}
	}

#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	/*
	 * Bypass packet filtering for packets previously handled by IPsec.
	 */
	if (IPSEC_ENABLED(ipv4) &&
	    IPSEC_CAPS(ipv4, m, IPSEC_CAP_BYPASS_FILTER) != 0)
			goto passin;
#endif

	/*
	 * Run through list of hooks for input packets.
	 *
	 * NB: Beware of the destination address changing (e.g.
	 *     by NAT rewriting).  When this happens, tell
	 *     ip_forward to do the right thing.
	 */

	/* Jump over all PFIL processing if hooks are not active. */
	if (!PFIL_HOOKED_IN(V_inet_pfil_head))
		goto passin;

	odst = ip->ip_dst;
	if (pfil_run_hooks(V_inet_pfil_head, &m, ifp, PFIL_IN, NULL) !=
	    PFIL_PASS)
		return;
	if (m == NULL)			/* consumed by filter */
		return;

	/* The hooks may have replaced the mbuf; reload derived state. */
	ip = mtod(m, struct ip *);
	dchg = (odst.s_addr != ip->ip_dst.s_addr);
	ifp = m->m_pkthdr.rcvif;

	if (m->m_flags & M_FASTFWD_OURS) {
		m->m_flags &= ~M_FASTFWD_OURS;
		goto ours;
	}
	if (m->m_flags & M_IP_NEXTHOP) {
		if (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL) {
			/*
			 * Directly ship the packet on.  This allows
			 * forwarding packets originally destined to us
			 * to some other directly connected host.
			 */
			ip_forward(m, 1);
			return;
		}
	}
passin:

	/*
	 * Process options and, if not destined for us,
	 * ship it on.  ip_dooptions returns 1 when an
	 * error was detected (causing an icmp message
	 * to be sent and the original packet to be freed).
	 */
	if (hlen > sizeof (struct ip) && ip_dooptions(m, 0))
		return;

	/*
	 * greedy RSVP, snatches any PATH packet of the RSVP protocol and no
	 * matter if it is destined to another node, or whether it is
	 * a multicast one, RSVP wants it! and prevents it from being forwarded
	 * anywhere else. Also checks if the rsvp daemon is running before
	 * grabbing the packet.
	 */
	if (V_rsvp_on && ip->ip_p==IPPROTO_RSVP)
		goto ours;

	/*
	 * Check our list of addresses, to see if the packet is for us.
	 * If we don't have any addresses, assume any unicast packet
	 * we receive might be for us (and let the upper layers deal
	 * with it).
	 */
	if (CK_STAILQ_EMPTY(&V_in_ifaddrhead) &&
	    (m->m_flags & (M_MCAST|M_BCAST)) == 0)
		goto ours;

	/*
	 * Enable a consistency check between the destination address
	 * and the arrival interface for a unicast packet (the RFC 1122
	 * strong ES model) if IP forwarding is disabled and the packet
	 * is not locally generated and the packet is not subject to
	 * 'ipfw fwd'.
	 *
	 * XXX - Checking also should be disabled if the destination
	 * address is ipnat'ed to a different interface.
	 *
	 * XXX - Checking is incompatible with IP aliases added
	 * to the loopback interface instead of the interface where
	 * the packets are received.
	 *
	 * XXX - This is the case for carp vhost IPs as well so we
	 * insert a workaround. If the packet got here, we already
	 * checked with carp_iamatch() and carp_forus().
	 */
	checkif = V_ip_checkinterface && (V_ipforwarding == 0) &&
	    ifp != NULL && ((ifp->if_flags & IFF_LOOPBACK) == 0) &&
	    ifp->if_carp == NULL && (dchg == 0);

	/*
	 * Check for exact addresses in the hash bucket.
	 */
	IN_IFADDR_RLOCK(&in_ifa_tracker);
	LIST_FOREACH(ia, INADDR_HASH(ip->ip_dst.s_addr), ia_hash) {
		/*
		 * If the address matches, verify that the packet
		 * arrived via the correct interface if checking is
		 * enabled.
		 */
		if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr &&
		    (!checkif || ia->ia_ifp == ifp)) {
			/* Charge the packet to the matching address. */
			counter_u64_add(ia->ia_ifa.ifa_ipackets, 1);
			counter_u64_add(ia->ia_ifa.ifa_ibytes,
			    m->m_pkthdr.len);
			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
			goto ours;
		}
	}
	IN_IFADDR_RUNLOCK(&in_ifa_tracker);

	/*
	 * Check for broadcast addresses.
	 *
	 * Only accept broadcast packets that arrive via the matching
	 * interface.  Reception of forwarded directed broadcasts would
	 * be handled via ip_forward() and ether_output() with the loopback
	 * into the stack for SIMPLEX interfaces handled by ether_output().
	 */
	if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) {
		struct epoch_tracker et;

		NET_EPOCH_ENTER(et);
		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
			if (ifa->ifa_addr->sa_family != AF_INET)
				continue;
			ia = ifatoia(ifa);
			if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr ==
			    ip->ip_dst.s_addr) {
				counter_u64_add(ia->ia_ifa.ifa_ipackets, 1);
				counter_u64_add(ia->ia_ifa.ifa_ibytes,
				    m->m_pkthdr.len);
				NET_EPOCH_EXIT(et);
				goto ours;
			}
#ifdef BOOTP_COMPAT
			/* An address still set to INADDR_ANY accepts it too. */
			if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) {
				counter_u64_add(ia->ia_ifa.ifa_ipackets, 1);
				counter_u64_add(ia->ia_ifa.ifa_ibytes,
				    m->m_pkthdr.len);
				NET_EPOCH_EXIT(et);
				goto ours;
			}
#endif
		}
		NET_EPOCH_EXIT(et);
		ia = NULL;
	}
	/* RFC 3927 2.7: Do not forward datagrams for 169.254.0.0/16. */
	if (IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) {
		IPSTAT_INC(ips_cantforward);
		m_freem(m);
		return;
	}
	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
		if (V_ip_mrouter) {
			/*
			 * If we are acting as a multicast router, all
			 * incoming multicast packets are passed to the
			 * kernel-level multicast forwarding function.
			 * The packet is returned (relatively) intact; if
			 * ip_mforward() returns a non-zero value, the packet
			 * must be discarded, else it may be accepted below.
			 */
			if (ip_mforward && ip_mforward(ip, ifp, m, 0) != 0) {
				IPSTAT_INC(ips_cantforward);
				m_freem(m);
				return;
			}

			/*
			 * The process-level routing daemon needs to receive
			 * all multicast IGMP packets, whether or not this
			 * host belongs to their destination groups.
			 */
			if (ip->ip_p == IPPROTO_IGMP)
				goto ours;
			IPSTAT_INC(ips_forward);
		}
		/*
		 * Assume the packet is for us, to avoid prematurely taking
		 * a lock on the in_multi hash. Protocols must perform
		 * their own filtering and update statistics accordingly.
		 */
		goto ours;
	}
	if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST)
		goto ours;
	if (ip->ip_dst.s_addr == INADDR_ANY)
		goto ours;

	/*
	 * Not for us; forward if possible and desirable.
	 */
	if (V_ipforwarding == 0) {
		IPSTAT_INC(ips_cantforward);
		m_freem(m);
	} else {
		ip_forward(m, dchg);
	}
	return;

ours:
#ifdef IPSTEALTH
	/*
	 * IPSTEALTH: Process non-routing options only
	 * if the packet is destined for us.
	 */
	if (V_ipstealth && hlen > sizeof (struct ip) && ip_dooptions(m, 1))
		return;
#endif /* IPSTEALTH */

	/*
	 * Attempt reassembly; if it succeeds, proceed.
	 * ip_reass() will return a different mbuf.
	 */
	if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) {
		/* XXXGL: shouldn't we save & set m_flags? */
		m = ip_reass(m);
		if (m == NULL)
			return;
		ip = mtod(m, struct ip *);
		/* Get the header length of the reassembled packet */
		hlen = ip->ip_hl << 2;
	}

#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	if (IPSEC_ENABLED(ipv4)) {
		if (IPSEC_INPUT(ipv4, m, hlen, ip->ip_p) != 0)
			return;
	}
#endif /* IPSEC */

	/*
	 * Switch out to protocol's input routine.
	 */
	IPSTAT_INC(ips_delivered);

	(*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p);
	return;
bad:
	m_freem(m);
}

/*
 * IP timer processing;
 * if a timer expires on a reassembly
 * queue, discard it.
839 */ 840 void 841 ip_slowtimo(void) 842 { 843 VNET_ITERATOR_DECL(vnet_iter); 844 845 VNET_LIST_RLOCK_NOSLEEP(); 846 VNET_FOREACH(vnet_iter) { 847 CURVNET_SET(vnet_iter); 848 ipreass_slowtimo(); 849 CURVNET_RESTORE(); 850 } 851 VNET_LIST_RUNLOCK_NOSLEEP(); 852 } 853 854 void 855 ip_drain(void) 856 { 857 VNET_ITERATOR_DECL(vnet_iter); 858 859 VNET_LIST_RLOCK_NOSLEEP(); 860 VNET_FOREACH(vnet_iter) { 861 CURVNET_SET(vnet_iter); 862 ipreass_drain(); 863 CURVNET_RESTORE(); 864 } 865 VNET_LIST_RUNLOCK_NOSLEEP(); 866 } 867 868 /* 869 * The protocol to be inserted into ip_protox[] must be already registered 870 * in inetsw[], either statically or through pf_proto_register(). 871 */ 872 int 873 ipproto_register(short ipproto) 874 { 875 struct protosw *pr; 876 877 /* Sanity checks. */ 878 if (ipproto <= 0 || ipproto >= IPPROTO_MAX) 879 return (EPROTONOSUPPORT); 880 881 /* 882 * The protocol slot must not be occupied by another protocol 883 * already. An index pointing to IPPROTO_RAW is unused. 884 */ 885 pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); 886 if (pr == NULL) 887 return (EPFNOSUPPORT); 888 if (ip_protox[ipproto] != pr - inetsw) /* IPPROTO_RAW */ 889 return (EEXIST); 890 891 /* Find the protocol position in inetsw[] and set the index. */ 892 for (pr = inetdomain.dom_protosw; 893 pr < inetdomain.dom_protoswNPROTOSW; pr++) { 894 if (pr->pr_domain->dom_family == PF_INET && 895 pr->pr_protocol && pr->pr_protocol == ipproto) { 896 ip_protox[pr->pr_protocol] = pr - inetsw; 897 return (0); 898 } 899 } 900 return (EPROTONOSUPPORT); 901 } 902 903 int 904 ipproto_unregister(short ipproto) 905 { 906 struct protosw *pr; 907 908 /* Sanity checks. */ 909 if (ipproto <= 0 || ipproto >= IPPROTO_MAX) 910 return (EPROTONOSUPPORT); 911 912 /* Check if the protocol was indeed registered. 
*/ 913 pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); 914 if (pr == NULL) 915 return (EPFNOSUPPORT); 916 if (ip_protox[ipproto] == pr - inetsw) /* IPPROTO_RAW */ 917 return (ENOENT); 918 919 /* Reset the protocol slot to IPPROTO_RAW. */ 920 ip_protox[ipproto] = pr - inetsw; 921 return (0); 922 } 923 924 u_char inetctlerrmap[PRC_NCMDS] = { 925 0, 0, 0, 0, 926 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, 927 EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, 928 EMSGSIZE, EHOSTUNREACH, 0, 0, 929 0, 0, EHOSTUNREACH, 0, 930 ENOPROTOOPT, ECONNREFUSED 931 }; 932 933 /* 934 * Forward a packet. If some error occurs return the sender 935 * an icmp packet. Note we can't always generate a meaningful 936 * icmp message because icmp doesn't have a large enough repertoire 937 * of codes and types. 938 * 939 * If not forwarding, just drop the packet. This could be confusing 940 * if ipforwarding was zero but some routing protocol was advancing 941 * us as a gateway to somewhere. However, we must let the routing 942 * protocol deal with that. 943 * 944 * The srcrt parameter indicates whether the packet is being forwarded 945 * via a source route. 
946 */ 947 void 948 ip_forward(struct mbuf *m, int srcrt) 949 { 950 struct ip *ip = mtod(m, struct ip *); 951 struct in_ifaddr *ia; 952 struct mbuf *mcopy; 953 struct sockaddr_in *sin; 954 struct in_addr dest; 955 struct route ro; 956 struct epoch_tracker et; 957 int error, type = 0, code = 0, mtu = 0; 958 959 if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) { 960 IPSTAT_INC(ips_cantforward); 961 m_freem(m); 962 return; 963 } 964 if ( 965 #ifdef IPSTEALTH 966 V_ipstealth == 0 && 967 #endif 968 ip->ip_ttl <= IPTTLDEC) { 969 icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 0, 0); 970 return; 971 } 972 973 bzero(&ro, sizeof(ro)); 974 sin = (struct sockaddr_in *)&ro.ro_dst; 975 sin->sin_family = AF_INET; 976 sin->sin_len = sizeof(*sin); 977 sin->sin_addr = ip->ip_dst; 978 #ifdef RADIX_MPATH 979 rtalloc_mpath_fib(&ro, 980 ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr), 981 M_GETFIB(m)); 982 #else 983 in_rtalloc_ign(&ro, 0, M_GETFIB(m)); 984 #endif 985 NET_EPOCH_ENTER(et); 986 if (ro.ro_rt != NULL) { 987 ia = ifatoia(ro.ro_rt->rt_ifa); 988 } else 989 ia = NULL; 990 /* 991 * Save the IP header and at most 8 bytes of the payload, 992 * in case we need to generate an ICMP message to the src. 993 * 994 * XXX this can be optimized a lot by saving the data in a local 995 * buffer on the stack (72 bytes at most), and only allocating the 996 * mbuf if really necessary. The vast majority of the packets 997 * are forwarded without having to send an ICMP back (either 998 * because unnecessary, or because rate limited), so we are 999 * really we are wasting a lot of work here. 1000 * 1001 * We don't use m_copym() because it might return a reference 1002 * to a shared cluster. Both this function and ip_output() 1003 * assume exclusive access to the IP header in `m', so any 1004 * data in a cluster may change before we reach icmp_error(). 
1005 */ 1006 mcopy = m_gethdr(M_NOWAIT, m->m_type); 1007 if (mcopy != NULL && !m_dup_pkthdr(mcopy, m, M_NOWAIT)) { 1008 /* 1009 * It's probably ok if the pkthdr dup fails (because 1010 * the deep copy of the tag chain failed), but for now 1011 * be conservative and just discard the copy since 1012 * code below may some day want the tags. 1013 */ 1014 m_free(mcopy); 1015 mcopy = NULL; 1016 } 1017 if (mcopy != NULL) { 1018 mcopy->m_len = min(ntohs(ip->ip_len), M_TRAILINGSPACE(mcopy)); 1019 mcopy->m_pkthdr.len = mcopy->m_len; 1020 m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t)); 1021 } 1022 #ifdef IPSTEALTH 1023 if (V_ipstealth == 0) 1024 #endif 1025 ip->ip_ttl -= IPTTLDEC; 1026 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 1027 if (IPSEC_ENABLED(ipv4)) { 1028 if ((error = IPSEC_FORWARD(ipv4, m)) != 0) { 1029 /* mbuf consumed by IPsec */ 1030 m_freem(mcopy); 1031 if (error != EINPROGRESS) 1032 IPSTAT_INC(ips_cantforward); 1033 goto out; 1034 } 1035 /* No IPsec processing required */ 1036 } 1037 #endif /* IPSEC */ 1038 /* 1039 * If forwarding packet using same interface that it came in on, 1040 * perhaps should send a redirect to sender to shortcut a hop. 1041 * Only send redirect if source is sending directly to us, 1042 * and if packet was not source routed (or has any options). 1043 * Also, don't send redirect if forwarding using a default route 1044 * or a route modified by a redirect. 
1045 */ 1046 dest.s_addr = 0; 1047 if (!srcrt && V_ipsendredirects && 1048 ia != NULL && ia->ia_ifp == m->m_pkthdr.rcvif) { 1049 struct rtentry *rt; 1050 1051 rt = ro.ro_rt; 1052 1053 if (rt && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 && 1054 satosin(rt_key(rt))->sin_addr.s_addr != 0) { 1055 #define RTA(rt) ((struct in_ifaddr *)(rt->rt_ifa)) 1056 u_long src = ntohl(ip->ip_src.s_addr); 1057 1058 if (RTA(rt) && 1059 (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) { 1060 if (rt->rt_flags & RTF_GATEWAY) 1061 dest.s_addr = satosin(rt->rt_gateway)->sin_addr.s_addr; 1062 else 1063 dest.s_addr = ip->ip_dst.s_addr; 1064 /* Router requirements says to only send host redirects */ 1065 type = ICMP_REDIRECT; 1066 code = ICMP_REDIRECT_HOST; 1067 } 1068 } 1069 } 1070 1071 error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL, NULL); 1072 1073 if (error == EMSGSIZE && ro.ro_rt) 1074 mtu = ro.ro_rt->rt_mtu; 1075 RO_RTFREE(&ro); 1076 1077 if (error) 1078 IPSTAT_INC(ips_cantforward); 1079 else { 1080 IPSTAT_INC(ips_forward); 1081 if (type) 1082 IPSTAT_INC(ips_redirectsent); 1083 else { 1084 if (mcopy) 1085 m_freem(mcopy); 1086 goto out; 1087 } 1088 } 1089 if (mcopy == NULL) 1090 goto out; 1091 1092 1093 switch (error) { 1094 1095 case 0: /* forwarded, but need redirect */ 1096 /* type, code set above */ 1097 break; 1098 1099 case ENETUNREACH: 1100 case EHOSTUNREACH: 1101 case ENETDOWN: 1102 case EHOSTDOWN: 1103 default: 1104 type = ICMP_UNREACH; 1105 code = ICMP_UNREACH_HOST; 1106 break; 1107 1108 case EMSGSIZE: 1109 type = ICMP_UNREACH; 1110 code = ICMP_UNREACH_NEEDFRAG; 1111 /* 1112 * If the MTU was set before make sure we are below the 1113 * interface MTU. 1114 * If the MTU wasn't set before use the interface mtu or 1115 * fall back to the next smaller mtu step compared to the 1116 * current packet size. 
1117 */ 1118 if (mtu != 0) { 1119 if (ia != NULL) 1120 mtu = min(mtu, ia->ia_ifp->if_mtu); 1121 } else { 1122 if (ia != NULL) 1123 mtu = ia->ia_ifp->if_mtu; 1124 else 1125 mtu = ip_next_mtu(ntohs(ip->ip_len), 0); 1126 } 1127 IPSTAT_INC(ips_cantfrag); 1128 break; 1129 1130 case ENOBUFS: 1131 case EACCES: /* ipfw denied packet */ 1132 m_freem(mcopy); 1133 goto out; 1134 } 1135 icmp_error(mcopy, type, code, dest.s_addr, mtu); 1136 out: 1137 NET_EPOCH_EXIT(et); 1138 } 1139 1140 #define CHECK_SO_CT(sp, ct) \ 1141 (((sp->so_options & SO_TIMESTAMP) && (sp->so_ts_clock == ct)) ? 1 : 0) 1142 1143 void 1144 ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip, 1145 struct mbuf *m) 1146 { 1147 bool stamped; 1148 1149 stamped = false; 1150 if ((inp->inp_socket->so_options & SO_BINTIME) || 1151 CHECK_SO_CT(inp->inp_socket, SO_TS_BINTIME)) { 1152 struct bintime boottimebin, bt; 1153 struct timespec ts1; 1154 1155 if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | 1156 M_TSTMP)) { 1157 mbuf_tstmp2timespec(m, &ts1); 1158 timespec2bintime(&ts1, &bt); 1159 getboottimebin(&boottimebin); 1160 bintime_add(&bt, &boottimebin); 1161 } else { 1162 bintime(&bt); 1163 } 1164 *mp = sbcreatecontrol((caddr_t)&bt, sizeof(bt), 1165 SCM_BINTIME, SOL_SOCKET); 1166 if (*mp != NULL) { 1167 mp = &(*mp)->m_next; 1168 stamped = true; 1169 } 1170 } 1171 if (CHECK_SO_CT(inp->inp_socket, SO_TS_REALTIME_MICRO)) { 1172 struct bintime boottimebin, bt1; 1173 struct timespec ts1;; 1174 struct timeval tv; 1175 1176 if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | 1177 M_TSTMP)) { 1178 mbuf_tstmp2timespec(m, &ts1); 1179 timespec2bintime(&ts1, &bt1); 1180 getboottimebin(&boottimebin); 1181 bintime_add(&bt1, &boottimebin); 1182 bintime2timeval(&bt1, &tv); 1183 } else { 1184 microtime(&tv); 1185 } 1186 *mp = sbcreatecontrol((caddr_t)&tv, sizeof(tv), 1187 SCM_TIMESTAMP, SOL_SOCKET); 1188 if (*mp != NULL) { 1189 mp = &(*mp)->m_next; 1190 stamped = true; 1191 } 1192 } else if 
(CHECK_SO_CT(inp->inp_socket, SO_TS_REALTIME)) { 1193 struct bintime boottimebin; 1194 struct timespec ts, ts1; 1195 1196 if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | 1197 M_TSTMP)) { 1198 mbuf_tstmp2timespec(m, &ts); 1199 getboottimebin(&boottimebin); 1200 bintime2timespec(&boottimebin, &ts1); 1201 timespecadd(&ts, &ts1, &ts); 1202 } else { 1203 nanotime(&ts); 1204 } 1205 *mp = sbcreatecontrol((caddr_t)&ts, sizeof(ts), 1206 SCM_REALTIME, SOL_SOCKET); 1207 if (*mp != NULL) { 1208 mp = &(*mp)->m_next; 1209 stamped = true; 1210 } 1211 } else if (CHECK_SO_CT(inp->inp_socket, SO_TS_MONOTONIC)) { 1212 struct timespec ts; 1213 1214 if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | 1215 M_TSTMP)) 1216 mbuf_tstmp2timespec(m, &ts); 1217 else 1218 nanouptime(&ts); 1219 *mp = sbcreatecontrol((caddr_t)&ts, sizeof(ts), 1220 SCM_MONOTONIC, SOL_SOCKET); 1221 if (*mp != NULL) { 1222 mp = &(*mp)->m_next; 1223 stamped = true; 1224 } 1225 } 1226 if (stamped && (m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | 1227 M_TSTMP)) { 1228 struct sock_timestamp_info sti; 1229 1230 bzero(&sti, sizeof(sti)); 1231 sti.st_info_flags = ST_INFO_HW; 1232 if ((m->m_flags & M_TSTMP_HPREC) != 0) 1233 sti.st_info_flags |= ST_INFO_HW_HPREC; 1234 *mp = sbcreatecontrol((caddr_t)&sti, sizeof(sti), SCM_TIME_INFO, 1235 SOL_SOCKET); 1236 if (*mp != NULL) 1237 mp = &(*mp)->m_next; 1238 } 1239 if (inp->inp_flags & INP_RECVDSTADDR) { 1240 *mp = sbcreatecontrol((caddr_t)&ip->ip_dst, 1241 sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP); 1242 if (*mp) 1243 mp = &(*mp)->m_next; 1244 } 1245 if (inp->inp_flags & INP_RECVTTL) { 1246 *mp = sbcreatecontrol((caddr_t)&ip->ip_ttl, 1247 sizeof(u_char), IP_RECVTTL, IPPROTO_IP); 1248 if (*mp) 1249 mp = &(*mp)->m_next; 1250 } 1251 #ifdef notyet 1252 /* XXX 1253 * Moving these out of udp_input() made them even more broken 1254 * than they already were. 
1255 */ 1256 /* options were tossed already */ 1257 if (inp->inp_flags & INP_RECVOPTS) { 1258 *mp = sbcreatecontrol((caddr_t)opts_deleted_above, 1259 sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP); 1260 if (*mp) 1261 mp = &(*mp)->m_next; 1262 } 1263 /* ip_srcroute doesn't do what we want here, need to fix */ 1264 if (inp->inp_flags & INP_RECVRETOPTS) { 1265 *mp = sbcreatecontrol((caddr_t)ip_srcroute(m), 1266 sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP); 1267 if (*mp) 1268 mp = &(*mp)->m_next; 1269 } 1270 #endif 1271 if (inp->inp_flags & INP_RECVIF) { 1272 struct ifnet *ifp; 1273 struct sdlbuf { 1274 struct sockaddr_dl sdl; 1275 u_char pad[32]; 1276 } sdlbuf; 1277 struct sockaddr_dl *sdp; 1278 struct sockaddr_dl *sdl2 = &sdlbuf.sdl; 1279 1280 if ((ifp = m->m_pkthdr.rcvif) && 1281 ifp->if_index && ifp->if_index <= V_if_index) { 1282 sdp = (struct sockaddr_dl *)ifp->if_addr->ifa_addr; 1283 /* 1284 * Change our mind and don't try copy. 1285 */ 1286 if (sdp->sdl_family != AF_LINK || 1287 sdp->sdl_len > sizeof(sdlbuf)) { 1288 goto makedummy; 1289 } 1290 bcopy(sdp, sdl2, sdp->sdl_len); 1291 } else { 1292 makedummy: 1293 sdl2->sdl_len = 1294 offsetof(struct sockaddr_dl, sdl_data[0]); 1295 sdl2->sdl_family = AF_LINK; 1296 sdl2->sdl_index = 0; 1297 sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0; 1298 } 1299 *mp = sbcreatecontrol((caddr_t)sdl2, sdl2->sdl_len, 1300 IP_RECVIF, IPPROTO_IP); 1301 if (*mp) 1302 mp = &(*mp)->m_next; 1303 } 1304 if (inp->inp_flags & INP_RECVTOS) { 1305 *mp = sbcreatecontrol((caddr_t)&ip->ip_tos, 1306 sizeof(u_char), IP_RECVTOS, IPPROTO_IP); 1307 if (*mp) 1308 mp = &(*mp)->m_next; 1309 } 1310 1311 if (inp->inp_flags2 & INP_RECVFLOWID) { 1312 uint32_t flowid, flow_type; 1313 1314 flowid = m->m_pkthdr.flowid; 1315 flow_type = M_HASHTYPE_GET(m); 1316 1317 /* 1318 * XXX should handle the failure of one or the 1319 * other - don't populate both? 
1320 */ 1321 *mp = sbcreatecontrol((caddr_t) &flowid, 1322 sizeof(uint32_t), IP_FLOWID, IPPROTO_IP); 1323 if (*mp) 1324 mp = &(*mp)->m_next; 1325 *mp = sbcreatecontrol((caddr_t) &flow_type, 1326 sizeof(uint32_t), IP_FLOWTYPE, IPPROTO_IP); 1327 if (*mp) 1328 mp = &(*mp)->m_next; 1329 } 1330 1331 #ifdef RSS 1332 if (inp->inp_flags2 & INP_RECVRSSBUCKETID) { 1333 uint32_t flowid, flow_type; 1334 uint32_t rss_bucketid; 1335 1336 flowid = m->m_pkthdr.flowid; 1337 flow_type = M_HASHTYPE_GET(m); 1338 1339 if (rss_hash2bucket(flowid, flow_type, &rss_bucketid) == 0) { 1340 *mp = sbcreatecontrol((caddr_t) &rss_bucketid, 1341 sizeof(uint32_t), IP_RSSBUCKETID, IPPROTO_IP); 1342 if (*mp) 1343 mp = &(*mp)->m_next; 1344 } 1345 } 1346 #endif 1347 } 1348 1349 /* 1350 * XXXRW: Multicast routing code in ip_mroute.c is generally MPSAFE, but the 1351 * ip_rsvp and ip_rsvp_on variables need to be interlocked with rsvp_on 1352 * locking. This code remains in ip_input.c as ip_mroute.c is optionally 1353 * compiled. 1354 */ 1355 VNET_DEFINE_STATIC(int, ip_rsvp_on); 1356 VNET_DEFINE(struct socket *, ip_rsvpd); 1357 1358 #define V_ip_rsvp_on VNET(ip_rsvp_on) 1359 1360 int 1361 ip_rsvp_init(struct socket *so) 1362 { 1363 1364 if (so->so_type != SOCK_RAW || 1365 so->so_proto->pr_protocol != IPPROTO_RSVP) 1366 return EOPNOTSUPP; 1367 1368 if (V_ip_rsvpd != NULL) 1369 return EADDRINUSE; 1370 1371 V_ip_rsvpd = so; 1372 /* 1373 * This may seem silly, but we need to be sure we don't over-increment 1374 * the RSVP counter, in case something slips up. 1375 */ 1376 if (!V_ip_rsvp_on) { 1377 V_ip_rsvp_on = 1; 1378 V_rsvp_on++; 1379 } 1380 1381 return 0; 1382 } 1383 1384 int 1385 ip_rsvp_done(void) 1386 { 1387 1388 V_ip_rsvpd = NULL; 1389 /* 1390 * This may seem silly, but we need to be sure we don't over-decrement 1391 * the RSVP counter, in case something slips up. 
1392 */ 1393 if (V_ip_rsvp_on) { 1394 V_ip_rsvp_on = 0; 1395 V_rsvp_on--; 1396 } 1397 return 0; 1398 } 1399 1400 int 1401 rsvp_input(struct mbuf **mp, int *offp, int proto) 1402 { 1403 struct mbuf *m; 1404 1405 m = *mp; 1406 *mp = NULL; 1407 1408 if (rsvp_input_p) { /* call the real one if loaded */ 1409 *mp = m; 1410 rsvp_input_p(mp, offp, proto); 1411 return (IPPROTO_DONE); 1412 } 1413 1414 /* Can still get packets with rsvp_on = 0 if there is a local member 1415 * of the group to which the RSVP packet is addressed. But in this 1416 * case we want to throw the packet away. 1417 */ 1418 1419 if (!V_rsvp_on) { 1420 m_freem(m); 1421 return (IPPROTO_DONE); 1422 } 1423 1424 if (V_ip_rsvpd != NULL) { 1425 *mp = m; 1426 rip_input(mp, offp, proto); 1427 return (IPPROTO_DONE); 1428 } 1429 /* Drop the packet */ 1430 m_freem(m); 1431 return (IPPROTO_DONE); 1432 } 1433