1 /*- 2 * Copyright (c) 1982, 1986, 1988, 1993 3 * The Regents of the University of California. 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 4. Neither the name of the University nor the names of its contributors 15 * may be used to endorse or promote products derived from this software 16 * without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 
*
 *	@(#)raw_ip.c	8.7 (Berkeley) 5/15/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"

#include <sys/param.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/rmlock.h>
#include <sys/rwlock.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/systm.h>

#include <vm/uma.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/if_ether.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_mroute.h>
#include <netinet/ip_icmp.h>

#ifdef IPSEC
#include <netipsec/ipsec.h>
#endif /*IPSEC*/

#include <machine/stdarg.h>
#include <security/mac/mac_framework.h>

/*
 * Default IP TTL, initialised to IPDEFTTL and exported read/write through
 * the "ttl" node registered under _net_inet_ip just below.
 */
VNET_DEFINE(int, ip_defttl) = IPDEFTTL;
SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(ip_defttl), 0,
    "Maximum TTL on IP packets");

/*
 * PCB list head and pcbinfo for raw IP sockets, defined through
 * VNET_DEFINE and accessed in this file via the V_ macros below.
 */
VNET_DEFINE(struct inpcbhead, ripcb);
VNET_DEFINE(struct inpcbinfo, ripcbinfo);

#define	V_ripcb			VNET(ripcb)
#define	V_ripcbinfo		VNET(ripcbinfo)

/*
 * Control and data hooks for ipfw, dummynet, divert and so on.
 * The data hooks are not used here but it is convenient
 * to keep them all in one place.
98 */ 99 VNET_DEFINE(ip_fw_chk_ptr_t, ip_fw_chk_ptr) = NULL; 100 VNET_DEFINE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr) = NULL; 101 102 int (*ip_dn_ctl_ptr)(struct sockopt *); 103 int (*ip_dn_io_ptr)(struct mbuf **, int, struct ip_fw_args *); 104 void (*ip_divert_ptr)(struct mbuf *, int); 105 int (*ng_ipfw_input_p)(struct mbuf **, int, 106 struct ip_fw_args *, int); 107 108 #ifdef INET 109 /* 110 * Hooks for multicast routing. They all default to NULL, so leave them not 111 * initialized and rely on BSS being set to 0. 112 */ 113 114 /* 115 * The socket used to communicate with the multicast routing daemon. 116 */ 117 VNET_DEFINE(struct socket *, ip_mrouter); 118 119 /* 120 * The various mrouter and rsvp functions. 121 */ 122 int (*ip_mrouter_set)(struct socket *, struct sockopt *); 123 int (*ip_mrouter_get)(struct socket *, struct sockopt *); 124 int (*ip_mrouter_done)(void); 125 int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, 126 struct ip_moptions *); 127 int (*mrt_ioctl)(u_long, caddr_t, int); 128 int (*legal_vif_num)(int); 129 u_long (*ip_mcast_src)(int); 130 131 int (*rsvp_input_p)(struct mbuf **, int *, int); 132 int (*ip_rsvp_vif)(struct socket *, struct sockopt *); 133 void (*ip_rsvp_force_done)(struct socket *); 134 #endif /* INET */ 135 136 extern struct protosw inetsw[]; 137 138 u_long rip_sendspace = 9216; 139 SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW, 140 &rip_sendspace, 0, "Maximum outgoing raw IP datagram size"); 141 142 u_long rip_recvspace = 9216; 143 SYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW, 144 &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams"); 145 146 /* 147 * Hash functions 148 */ 149 150 #define INP_PCBHASH_RAW_SIZE 256 151 #define INP_PCBHASH_RAW(proto, laddr, faddr, mask) \ 152 (((proto) + (laddr) + (faddr)) % (mask) + 1) 153 154 #ifdef INET 155 static void 156 rip_inshash(struct inpcb *inp) 157 { 158 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 159 struct inpcbhead *pcbhash; 160 int 
hash; 161 162 INP_INFO_WLOCK_ASSERT(pcbinfo); 163 INP_WLOCK_ASSERT(inp); 164 165 if (inp->inp_ip_p != 0 && 166 inp->inp_laddr.s_addr != INADDR_ANY && 167 inp->inp_faddr.s_addr != INADDR_ANY) { 168 hash = INP_PCBHASH_RAW(inp->inp_ip_p, inp->inp_laddr.s_addr, 169 inp->inp_faddr.s_addr, pcbinfo->ipi_hashmask); 170 } else 171 hash = 0; 172 pcbhash = &pcbinfo->ipi_hashbase[hash]; 173 LIST_INSERT_HEAD(pcbhash, inp, inp_hash); 174 } 175 176 static void 177 rip_delhash(struct inpcb *inp) 178 { 179 180 INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); 181 INP_WLOCK_ASSERT(inp); 182 183 LIST_REMOVE(inp, inp_hash); 184 } 185 #endif /* INET */ 186 187 /* 188 * Raw interface to IP protocol. 189 */ 190 191 /* 192 * Initialize raw connection block q. 193 */ 194 static void 195 rip_zone_change(void *tag) 196 { 197 198 uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets); 199 } 200 201 static int 202 rip_inpcb_init(void *mem, int size, int flags) 203 { 204 struct inpcb *inp = mem; 205 206 INP_LOCK_INIT(inp, "inp", "rawinp"); 207 return (0); 208 } 209 210 void 211 rip_init(void) 212 { 213 214 in_pcbinfo_init(&V_ripcbinfo, "rip", &V_ripcb, INP_PCBHASH_RAW_SIZE, 215 1, "ripcb", rip_inpcb_init, NULL, 0, IPI_HASHFIELDS_NONE); 216 EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL, 217 EVENTHANDLER_PRI_ANY); 218 } 219 220 #ifdef VIMAGE 221 static void 222 rip_destroy(void *unused __unused) 223 { 224 225 in_pcbinfo_destroy(&V_ripcbinfo); 226 } 227 VNET_SYSUNINIT(raw_ip, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, rip_destroy, NULL); 228 #endif 229 230 #ifdef INET 231 static int 232 rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n, 233 struct sockaddr_in *ripsrc) 234 { 235 int policyfail = 0; 236 237 INP_LOCK_ASSERT(last); 238 239 #ifdef IPSEC 240 /* check AH/ESP integrity. 
*/ 241 if (ipsec4_in_reject(n, last)) { 242 policyfail = 1; 243 } 244 #endif /* IPSEC */ 245 #ifdef MAC 246 if (!policyfail && mac_inpcb_check_deliver(last, n) != 0) 247 policyfail = 1; 248 #endif 249 /* Check the minimum TTL for socket. */ 250 if (last->inp_ip_minttl && last->inp_ip_minttl > ip->ip_ttl) 251 policyfail = 1; 252 if (!policyfail) { 253 struct mbuf *opts = NULL; 254 struct socket *so; 255 256 so = last->inp_socket; 257 if ((last->inp_flags & INP_CONTROLOPTS) || 258 (so->so_options & (SO_TIMESTAMP | SO_BINTIME))) 259 ip_savecontrol(last, &opts, ip, n); 260 SOCKBUF_LOCK(&so->so_rcv); 261 if (sbappendaddr_locked(&so->so_rcv, 262 (struct sockaddr *)ripsrc, n, opts) == 0) { 263 /* should notify about lost packet */ 264 m_freem(n); 265 if (opts) 266 m_freem(opts); 267 SOCKBUF_UNLOCK(&so->so_rcv); 268 } else 269 sorwakeup_locked(so); 270 } else 271 m_freem(n); 272 return (policyfail); 273 } 274 275 /* 276 * Setup generic address and protocol structures for raw_input routine, then 277 * pass them along with mbuf chain. 
278 */ 279 int 280 rip_input(struct mbuf **mp, int *offp, int proto) 281 { 282 struct ifnet *ifp; 283 struct mbuf *m = *mp; 284 struct ip *ip = mtod(m, struct ip *); 285 struct inpcb *inp, *last; 286 struct sockaddr_in ripsrc; 287 int hash; 288 289 *mp = NULL; 290 291 bzero(&ripsrc, sizeof(ripsrc)); 292 ripsrc.sin_len = sizeof(ripsrc); 293 ripsrc.sin_family = AF_INET; 294 ripsrc.sin_addr = ip->ip_src; 295 last = NULL; 296 297 ifp = m->m_pkthdr.rcvif; 298 299 hash = INP_PCBHASH_RAW(proto, ip->ip_src.s_addr, 300 ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask); 301 INP_INFO_RLOCK(&V_ripcbinfo); 302 LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[hash], inp_hash) { 303 if (inp->inp_ip_p != proto) 304 continue; 305 #ifdef INET6 306 /* XXX inp locking */ 307 if ((inp->inp_vflag & INP_IPV4) == 0) 308 continue; 309 #endif 310 if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr) 311 continue; 312 if (inp->inp_faddr.s_addr != ip->ip_src.s_addr) 313 continue; 314 if (jailed_without_vnet(inp->inp_cred)) { 315 /* 316 * XXX: If faddr was bound to multicast group, 317 * jailed raw socket will drop datagram. 
318 */ 319 if (prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0) 320 continue; 321 } 322 if (last != NULL) { 323 struct mbuf *n; 324 325 n = m_copym(m, 0, M_COPYALL, M_NOWAIT); 326 if (n != NULL) 327 (void) rip_append(last, ip, n, &ripsrc); 328 /* XXX count dropped packet */ 329 INP_RUNLOCK(last); 330 } 331 INP_RLOCK(inp); 332 last = inp; 333 } 334 LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[0], inp_hash) { 335 if (inp->inp_ip_p && inp->inp_ip_p != proto) 336 continue; 337 #ifdef INET6 338 /* XXX inp locking */ 339 if ((inp->inp_vflag & INP_IPV4) == 0) 340 continue; 341 #endif 342 if (!in_nullhost(inp->inp_laddr) && 343 !in_hosteq(inp->inp_laddr, ip->ip_dst)) 344 continue; 345 if (!in_nullhost(inp->inp_faddr) && 346 !in_hosteq(inp->inp_faddr, ip->ip_src)) 347 continue; 348 if (jailed_without_vnet(inp->inp_cred)) { 349 /* 350 * Allow raw socket in jail to receive multicast; 351 * assume process had PRIV_NETINET_RAW at attach, 352 * and fall through into normal filter path if so. 353 */ 354 if (!IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && 355 prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0) 356 continue; 357 } 358 /* 359 * If this raw socket has multicast state, and we 360 * have received a multicast, check if this socket 361 * should receive it, as multicast filtering is now 362 * the responsibility of the transport layer. 363 */ 364 if (inp->inp_moptions != NULL && 365 IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { 366 /* 367 * If the incoming datagram is for IGMP, allow it 368 * through unconditionally to the raw socket. 369 * 370 * In the case of IGMPv2, we may not have explicitly 371 * joined the group, and may have set IFF_ALLMULTI 372 * on the interface. imo_multi_filter() may discard 373 * control traffic we actually need to see. 374 * 375 * Userland multicast routing daemons should continue 376 * filter the control traffic appropriately. 
377 */ 378 int blocked; 379 380 blocked = MCAST_PASS; 381 if (proto != IPPROTO_IGMP) { 382 struct sockaddr_in group; 383 384 bzero(&group, sizeof(struct sockaddr_in)); 385 group.sin_len = sizeof(struct sockaddr_in); 386 group.sin_family = AF_INET; 387 group.sin_addr = ip->ip_dst; 388 389 blocked = imo_multi_filter(inp->inp_moptions, 390 ifp, 391 (struct sockaddr *)&group, 392 (struct sockaddr *)&ripsrc); 393 } 394 395 if (blocked != MCAST_PASS) { 396 IPSTAT_INC(ips_notmember); 397 continue; 398 } 399 } 400 if (last != NULL) { 401 struct mbuf *n; 402 403 n = m_copym(m, 0, M_COPYALL, M_NOWAIT); 404 if (n != NULL) 405 (void) rip_append(last, ip, n, &ripsrc); 406 /* XXX count dropped packet */ 407 INP_RUNLOCK(last); 408 } 409 INP_RLOCK(inp); 410 last = inp; 411 } 412 INP_INFO_RUNLOCK(&V_ripcbinfo); 413 if (last != NULL) { 414 if (rip_append(last, ip, m, &ripsrc) != 0) 415 IPSTAT_INC(ips_delivered); 416 INP_RUNLOCK(last); 417 } else { 418 if (inetsw[ip_protox[ip->ip_p]].pr_input == rip_input) { 419 IPSTAT_INC(ips_noproto); 420 IPSTAT_DEC(ips_delivered); 421 icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL, 0, 0); 422 } else { 423 m_freem(m); 424 } 425 } 426 return (IPPROTO_DONE); 427 } 428 429 /* 430 * Generate IP header and pass packet to ip_output. Tack on options user may 431 * have setup with control call. 432 */ 433 int 434 rip_output(struct mbuf *m, struct socket *so, ...) 435 { 436 struct ip *ip; 437 int error; 438 struct inpcb *inp = sotoinpcb(so); 439 va_list ap; 440 u_long dst; 441 int flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) | 442 IP_ALLOWBROADCAST; 443 444 va_start(ap, so); 445 dst = va_arg(ap, u_long); 446 va_end(ap); 447 448 /* 449 * If the user handed us a complete IP packet, use it. Otherwise, 450 * allocate an mbuf for a header and fill it in. 
451 */ 452 if ((inp->inp_flags & INP_HDRINCL) == 0) { 453 if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) { 454 m_freem(m); 455 return(EMSGSIZE); 456 } 457 M_PREPEND(m, sizeof(struct ip), M_NOWAIT); 458 if (m == NULL) 459 return(ENOBUFS); 460 461 INP_RLOCK(inp); 462 ip = mtod(m, struct ip *); 463 ip->ip_tos = inp->inp_ip_tos; 464 if (inp->inp_flags & INP_DONTFRAG) 465 ip->ip_off = htons(IP_DF); 466 else 467 ip->ip_off = htons(0); 468 ip->ip_p = inp->inp_ip_p; 469 ip->ip_len = htons(m->m_pkthdr.len); 470 ip->ip_src = inp->inp_laddr; 471 ip->ip_dst.s_addr = dst; 472 if (jailed(inp->inp_cred)) { 473 /* 474 * prison_local_ip4() would be good enough but would 475 * let a source of INADDR_ANY pass, which we do not 476 * want to see from jails. 477 */ 478 if (ip->ip_src.s_addr == INADDR_ANY) { 479 error = in_pcbladdr(inp, &ip->ip_dst, &ip->ip_src, 480 inp->inp_cred); 481 } else { 482 error = prison_local_ip4(inp->inp_cred, 483 &ip->ip_src); 484 } 485 if (error != 0) { 486 INP_RUNLOCK(inp); 487 m_freem(m); 488 return (error); 489 } 490 } 491 ip->ip_ttl = inp->inp_ip_ttl; 492 } else { 493 if (m->m_pkthdr.len > IP_MAXPACKET) { 494 m_freem(m); 495 return(EMSGSIZE); 496 } 497 INP_RLOCK(inp); 498 ip = mtod(m, struct ip *); 499 error = prison_check_ip4(inp->inp_cred, &ip->ip_src); 500 if (error != 0) { 501 INP_RUNLOCK(inp); 502 m_freem(m); 503 return (error); 504 } 505 506 /* 507 * Don't allow both user specified and setsockopt options, 508 * and don't allow packet length sizes that will crash. 509 */ 510 if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) 511 || (ntohs(ip->ip_len) != m->m_pkthdr.len) 512 || (ntohs(ip->ip_len) < (ip->ip_hl << 2))) { 513 INP_RUNLOCK(inp); 514 m_freem(m); 515 return (EINVAL); 516 } 517 /* 518 * This doesn't allow application to specify ID of zero, 519 * but we got this limitation from the beginning of history. 520 */ 521 if (ip->ip_id == 0) 522 ip_fillid(ip); 523 524 /* 525 * XXX prevent ip_output from overwriting header fields. 
526 */ 527 flags |= IP_RAWOUTPUT; 528 IPSTAT_INC(ips_rawout); 529 } 530 531 if (inp->inp_flags & INP_ONESBCAST) 532 flags |= IP_SENDONES; 533 534 #ifdef MAC 535 mac_inpcb_create_mbuf(inp, m); 536 #endif 537 538 error = ip_output(m, inp->inp_options, NULL, flags, 539 inp->inp_moptions, inp); 540 INP_RUNLOCK(inp); 541 return (error); 542 } 543 544 /* 545 * Raw IP socket option processing. 546 * 547 * IMPORTANT NOTE regarding access control: Traditionally, raw sockets could 548 * only be created by a privileged process, and as such, socket option 549 * operations to manage system properties on any raw socket were allowed to 550 * take place without explicit additional access control checks. However, 551 * raw sockets can now also be created in jail(), and therefore explicit 552 * checks are now required. Likewise, raw sockets can be used by a process 553 * after it gives up privilege, so some caution is required. For options 554 * passed down to the IP layer via ip_ctloutput(), checks are assumed to be 555 * performed in ip_ctloutput() and therefore no check occurs here. 556 * Unilaterally checking priv_check() here breaks normal IP socket option 557 * operations on raw sockets. 558 * 559 * When adding new socket options here, make sure to add access control 560 * checks here as necessary. 561 * 562 * XXX-BZ inp locking? 
563 */ 564 int 565 rip_ctloutput(struct socket *so, struct sockopt *sopt) 566 { 567 struct inpcb *inp = sotoinpcb(so); 568 int error, optval; 569 570 if (sopt->sopt_level != IPPROTO_IP) { 571 if ((sopt->sopt_level == SOL_SOCKET) && 572 (sopt->sopt_name == SO_SETFIB)) { 573 inp->inp_inc.inc_fibnum = so->so_fibnum; 574 return (0); 575 } 576 return (EINVAL); 577 } 578 579 error = 0; 580 switch (sopt->sopt_dir) { 581 case SOPT_GET: 582 switch (sopt->sopt_name) { 583 case IP_HDRINCL: 584 optval = inp->inp_flags & INP_HDRINCL; 585 error = sooptcopyout(sopt, &optval, sizeof optval); 586 break; 587 588 case IP_FW3: /* generic ipfw v.3 functions */ 589 case IP_FW_ADD: /* ADD actually returns the body... */ 590 case IP_FW_GET: 591 case IP_FW_TABLE_GETSIZE: 592 case IP_FW_TABLE_LIST: 593 case IP_FW_NAT_GET_CONFIG: 594 case IP_FW_NAT_GET_LOG: 595 if (V_ip_fw_ctl_ptr != NULL) 596 error = V_ip_fw_ctl_ptr(sopt); 597 else 598 error = ENOPROTOOPT; 599 break; 600 601 case IP_DUMMYNET3: /* generic dummynet v.3 functions */ 602 case IP_DUMMYNET_GET: 603 if (ip_dn_ctl_ptr != NULL) 604 error = ip_dn_ctl_ptr(sopt); 605 else 606 error = ENOPROTOOPT; 607 break ; 608 609 case MRT_INIT: 610 case MRT_DONE: 611 case MRT_ADD_VIF: 612 case MRT_DEL_VIF: 613 case MRT_ADD_MFC: 614 case MRT_DEL_MFC: 615 case MRT_VERSION: 616 case MRT_ASSERT: 617 case MRT_API_SUPPORT: 618 case MRT_API_CONFIG: 619 case MRT_ADD_BW_UPCALL: 620 case MRT_DEL_BW_UPCALL: 621 error = priv_check(curthread, PRIV_NETINET_MROUTE); 622 if (error != 0) 623 return (error); 624 error = ip_mrouter_get ? 
ip_mrouter_get(so, sopt) : 625 EOPNOTSUPP; 626 break; 627 628 default: 629 error = ip_ctloutput(so, sopt); 630 break; 631 } 632 break; 633 634 case SOPT_SET: 635 switch (sopt->sopt_name) { 636 case IP_HDRINCL: 637 error = sooptcopyin(sopt, &optval, sizeof optval, 638 sizeof optval); 639 if (error) 640 break; 641 if (optval) 642 inp->inp_flags |= INP_HDRINCL; 643 else 644 inp->inp_flags &= ~INP_HDRINCL; 645 break; 646 647 case IP_FW3: /* generic ipfw v.3 functions */ 648 case IP_FW_ADD: 649 case IP_FW_DEL: 650 case IP_FW_FLUSH: 651 case IP_FW_ZERO: 652 case IP_FW_RESETLOG: 653 case IP_FW_TABLE_ADD: 654 case IP_FW_TABLE_DEL: 655 case IP_FW_TABLE_FLUSH: 656 case IP_FW_NAT_CFG: 657 case IP_FW_NAT_DEL: 658 if (V_ip_fw_ctl_ptr != NULL) 659 error = V_ip_fw_ctl_ptr(sopt); 660 else 661 error = ENOPROTOOPT; 662 break; 663 664 case IP_DUMMYNET3: /* generic dummynet v.3 functions */ 665 case IP_DUMMYNET_CONFIGURE: 666 case IP_DUMMYNET_DEL: 667 case IP_DUMMYNET_FLUSH: 668 if (ip_dn_ctl_ptr != NULL) 669 error = ip_dn_ctl_ptr(sopt); 670 else 671 error = ENOPROTOOPT ; 672 break ; 673 674 case IP_RSVP_ON: 675 error = priv_check(curthread, PRIV_NETINET_MROUTE); 676 if (error != 0) 677 return (error); 678 error = ip_rsvp_init(so); 679 break; 680 681 case IP_RSVP_OFF: 682 error = priv_check(curthread, PRIV_NETINET_MROUTE); 683 if (error != 0) 684 return (error); 685 error = ip_rsvp_done(); 686 break; 687 688 case IP_RSVP_VIF_ON: 689 case IP_RSVP_VIF_OFF: 690 error = priv_check(curthread, PRIV_NETINET_MROUTE); 691 if (error != 0) 692 return (error); 693 error = ip_rsvp_vif ? 
694 ip_rsvp_vif(so, sopt) : EINVAL; 695 break; 696 697 case MRT_INIT: 698 case MRT_DONE: 699 case MRT_ADD_VIF: 700 case MRT_DEL_VIF: 701 case MRT_ADD_MFC: 702 case MRT_DEL_MFC: 703 case MRT_VERSION: 704 case MRT_ASSERT: 705 case MRT_API_SUPPORT: 706 case MRT_API_CONFIG: 707 case MRT_ADD_BW_UPCALL: 708 case MRT_DEL_BW_UPCALL: 709 error = priv_check(curthread, PRIV_NETINET_MROUTE); 710 if (error != 0) 711 return (error); 712 error = ip_mrouter_set ? ip_mrouter_set(so, sopt) : 713 EOPNOTSUPP; 714 break; 715 716 default: 717 error = ip_ctloutput(so, sopt); 718 break; 719 } 720 break; 721 } 722 723 return (error); 724 } 725 726 /* 727 * This function exists solely to receive the PRC_IFDOWN messages which are 728 * sent by if_down(). It looks for an ifaddr whose ifa_addr is sa, and calls 729 * in_ifadown() to remove all routes corresponding to that address. It also 730 * receives the PRC_IFUP messages from if_up() and reinstalls the interface 731 * routes. 732 */ 733 void 734 rip_ctlinput(int cmd, struct sockaddr *sa, void *vip) 735 { 736 struct rm_priotracker in_ifa_tracker; 737 struct in_ifaddr *ia; 738 struct ifnet *ifp; 739 int err; 740 int flags; 741 742 switch (cmd) { 743 case PRC_IFDOWN: 744 IN_IFADDR_RLOCK(&in_ifa_tracker); 745 TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { 746 if (ia->ia_ifa.ifa_addr == sa 747 && (ia->ia_flags & IFA_ROUTE)) { 748 ifa_ref(&ia->ia_ifa); 749 IN_IFADDR_RUNLOCK(&in_ifa_tracker); 750 /* 751 * in_scrubprefix() kills the interface route. 752 */ 753 in_scrubprefix(ia, 0); 754 /* 755 * in_ifadown gets rid of all the rest of the 756 * routes. This is not quite the right thing 757 * to do, but at least if we are running a 758 * routing process they will come back. 759 */ 760 in_ifadown(&ia->ia_ifa, 0); 761 ifa_free(&ia->ia_ifa); 762 break; 763 } 764 } 765 if (ia == NULL) /* If ia matched, already unlocked. 
*/ 766 IN_IFADDR_RUNLOCK(&in_ifa_tracker); 767 break; 768 769 case PRC_IFUP: 770 IN_IFADDR_RLOCK(&in_ifa_tracker); 771 TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { 772 if (ia->ia_ifa.ifa_addr == sa) 773 break; 774 } 775 if (ia == NULL || (ia->ia_flags & IFA_ROUTE)) { 776 IN_IFADDR_RUNLOCK(&in_ifa_tracker); 777 return; 778 } 779 ifa_ref(&ia->ia_ifa); 780 IN_IFADDR_RUNLOCK(&in_ifa_tracker); 781 flags = RTF_UP; 782 ifp = ia->ia_ifa.ifa_ifp; 783 784 if ((ifp->if_flags & IFF_LOOPBACK) 785 || (ifp->if_flags & IFF_POINTOPOINT)) 786 flags |= RTF_HOST; 787 788 err = ifa_del_loopback_route((struct ifaddr *)ia, sa); 789 790 err = rtinit(&ia->ia_ifa, RTM_ADD, flags); 791 if (err == 0) 792 ia->ia_flags |= IFA_ROUTE; 793 794 err = ifa_add_loopback_route((struct ifaddr *)ia, sa); 795 796 ifa_free(&ia->ia_ifa); 797 break; 798 } 799 } 800 801 static int 802 rip_attach(struct socket *so, int proto, struct thread *td) 803 { 804 struct inpcb *inp; 805 int error; 806 807 inp = sotoinpcb(so); 808 KASSERT(inp == NULL, ("rip_attach: inp != NULL")); 809 810 error = priv_check(td, PRIV_NETINET_RAW); 811 if (error) 812 return (error); 813 if (proto >= IPPROTO_MAX || proto < 0) 814 return EPROTONOSUPPORT; 815 error = soreserve(so, rip_sendspace, rip_recvspace); 816 if (error) 817 return (error); 818 INP_INFO_WLOCK(&V_ripcbinfo); 819 error = in_pcballoc(so, &V_ripcbinfo); 820 if (error) { 821 INP_INFO_WUNLOCK(&V_ripcbinfo); 822 return (error); 823 } 824 inp = (struct inpcb *)so->so_pcb; 825 inp->inp_vflag |= INP_IPV4; 826 inp->inp_ip_p = proto; 827 inp->inp_ip_ttl = V_ip_defttl; 828 rip_inshash(inp); 829 INP_INFO_WUNLOCK(&V_ripcbinfo); 830 INP_WUNLOCK(inp); 831 return (0); 832 } 833 834 static void 835 rip_detach(struct socket *so) 836 { 837 struct inpcb *inp; 838 839 inp = sotoinpcb(so); 840 KASSERT(inp != NULL, ("rip_detach: inp == NULL")); 841 KASSERT(inp->inp_faddr.s_addr == INADDR_ANY, 842 ("rip_detach: not closed")); 843 844 INP_INFO_WLOCK(&V_ripcbinfo); 845 INP_WLOCK(inp); 846 
rip_delhash(inp); 847 if (so == V_ip_mrouter && ip_mrouter_done) 848 ip_mrouter_done(); 849 if (ip_rsvp_force_done) 850 ip_rsvp_force_done(so); 851 if (so == V_ip_rsvpd) 852 ip_rsvp_done(); 853 in_pcbdetach(inp); 854 in_pcbfree(inp); 855 INP_INFO_WUNLOCK(&V_ripcbinfo); 856 } 857 858 static void 859 rip_dodisconnect(struct socket *so, struct inpcb *inp) 860 { 861 struct inpcbinfo *pcbinfo; 862 863 pcbinfo = inp->inp_pcbinfo; 864 INP_INFO_WLOCK(pcbinfo); 865 INP_WLOCK(inp); 866 rip_delhash(inp); 867 inp->inp_faddr.s_addr = INADDR_ANY; 868 rip_inshash(inp); 869 SOCK_LOCK(so); 870 so->so_state &= ~SS_ISCONNECTED; 871 SOCK_UNLOCK(so); 872 INP_WUNLOCK(inp); 873 INP_INFO_WUNLOCK(pcbinfo); 874 } 875 876 static void 877 rip_abort(struct socket *so) 878 { 879 struct inpcb *inp; 880 881 inp = sotoinpcb(so); 882 KASSERT(inp != NULL, ("rip_abort: inp == NULL")); 883 884 rip_dodisconnect(so, inp); 885 } 886 887 static void 888 rip_close(struct socket *so) 889 { 890 struct inpcb *inp; 891 892 inp = sotoinpcb(so); 893 KASSERT(inp != NULL, ("rip_close: inp == NULL")); 894 895 rip_dodisconnect(so, inp); 896 } 897 898 static int 899 rip_disconnect(struct socket *so) 900 { 901 struct inpcb *inp; 902 903 if ((so->so_state & SS_ISCONNECTED) == 0) 904 return (ENOTCONN); 905 906 inp = sotoinpcb(so); 907 KASSERT(inp != NULL, ("rip_disconnect: inp == NULL")); 908 909 rip_dodisconnect(so, inp); 910 return (0); 911 } 912 913 static int 914 rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td) 915 { 916 struct sockaddr_in *addr = (struct sockaddr_in *)nam; 917 struct inpcb *inp; 918 int error; 919 920 if (nam->sa_len != sizeof(*addr)) 921 return (EINVAL); 922 923 error = prison_check_ip4(td->td_ucred, &addr->sin_addr); 924 if (error != 0) 925 return (error); 926 927 inp = sotoinpcb(so); 928 KASSERT(inp != NULL, ("rip_bind: inp == NULL")); 929 930 if (TAILQ_EMPTY(&V_ifnet) || 931 (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) || 932 (addr->sin_addr.s_addr && 933 
(inp->inp_flags & INP_BINDANY) == 0 && 934 ifa_ifwithaddr_check((struct sockaddr *)addr) == 0)) 935 return (EADDRNOTAVAIL); 936 937 INP_INFO_WLOCK(&V_ripcbinfo); 938 INP_WLOCK(inp); 939 rip_delhash(inp); 940 inp->inp_laddr = addr->sin_addr; 941 rip_inshash(inp); 942 INP_WUNLOCK(inp); 943 INP_INFO_WUNLOCK(&V_ripcbinfo); 944 return (0); 945 } 946 947 static int 948 rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 949 { 950 struct sockaddr_in *addr = (struct sockaddr_in *)nam; 951 struct inpcb *inp; 952 953 if (nam->sa_len != sizeof(*addr)) 954 return (EINVAL); 955 if (TAILQ_EMPTY(&V_ifnet)) 956 return (EADDRNOTAVAIL); 957 if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) 958 return (EAFNOSUPPORT); 959 960 inp = sotoinpcb(so); 961 KASSERT(inp != NULL, ("rip_connect: inp == NULL")); 962 963 INP_INFO_WLOCK(&V_ripcbinfo); 964 INP_WLOCK(inp); 965 rip_delhash(inp); 966 inp->inp_faddr = addr->sin_addr; 967 rip_inshash(inp); 968 soisconnected(so); 969 INP_WUNLOCK(inp); 970 INP_INFO_WUNLOCK(&V_ripcbinfo); 971 return (0); 972 } 973 974 static int 975 rip_shutdown(struct socket *so) 976 { 977 struct inpcb *inp; 978 979 inp = sotoinpcb(so); 980 KASSERT(inp != NULL, ("rip_shutdown: inp == NULL")); 981 982 INP_WLOCK(inp); 983 socantsendmore(so); 984 INP_WUNLOCK(inp); 985 return (0); 986 } 987 988 static int 989 rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, 990 struct mbuf *control, struct thread *td) 991 { 992 struct inpcb *inp; 993 u_long dst; 994 995 inp = sotoinpcb(so); 996 KASSERT(inp != NULL, ("rip_send: inp == NULL")); 997 998 /* 999 * Note: 'dst' reads below are unlocked. 1000 */ 1001 if (so->so_state & SS_ISCONNECTED) { 1002 if (nam) { 1003 m_freem(m); 1004 return (EISCONN); 1005 } 1006 dst = inp->inp_faddr.s_addr; /* Unlocked read. 
*/ 1007 } else { 1008 if (nam == NULL) { 1009 m_freem(m); 1010 return (ENOTCONN); 1011 } 1012 dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr; 1013 } 1014 return (rip_output(m, so, dst)); 1015 } 1016 #endif /* INET */ 1017 1018 static int 1019 rip_pcblist(SYSCTL_HANDLER_ARGS) 1020 { 1021 int error, i, n; 1022 struct inpcb *inp, **inp_list; 1023 inp_gen_t gencnt; 1024 struct xinpgen xig; 1025 1026 /* 1027 * The process of preparing the TCB list is too time-consuming and 1028 * resource-intensive to repeat twice on every request. 1029 */ 1030 if (req->oldptr == 0) { 1031 n = V_ripcbinfo.ipi_count; 1032 n += imax(n / 8, 10); 1033 req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb); 1034 return (0); 1035 } 1036 1037 if (req->newptr != 0) 1038 return (EPERM); 1039 1040 /* 1041 * OK, now we're committed to doing something. 1042 */ 1043 INP_INFO_RLOCK(&V_ripcbinfo); 1044 gencnt = V_ripcbinfo.ipi_gencnt; 1045 n = V_ripcbinfo.ipi_count; 1046 INP_INFO_RUNLOCK(&V_ripcbinfo); 1047 1048 xig.xig_len = sizeof xig; 1049 xig.xig_count = n; 1050 xig.xig_gen = gencnt; 1051 xig.xig_sogen = so_gencnt; 1052 error = SYSCTL_OUT(req, &xig, sizeof xig); 1053 if (error) 1054 return (error); 1055 1056 inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); 1057 if (inp_list == NULL) 1058 return (ENOMEM); 1059 1060 INP_INFO_RLOCK(&V_ripcbinfo); 1061 for (inp = LIST_FIRST(V_ripcbinfo.ipi_listhead), i = 0; inp && i < n; 1062 inp = LIST_NEXT(inp, inp_list)) { 1063 INP_WLOCK(inp); 1064 if (inp->inp_gencnt <= gencnt && 1065 cr_canseeinpcb(req->td->td_ucred, inp) == 0) { 1066 in_pcbref(inp); 1067 inp_list[i++] = inp; 1068 } 1069 INP_WUNLOCK(inp); 1070 } 1071 INP_INFO_RUNLOCK(&V_ripcbinfo); 1072 n = i; 1073 1074 error = 0; 1075 for (i = 0; i < n; i++) { 1076 inp = inp_list[i]; 1077 INP_RLOCK(inp); 1078 if (inp->inp_gencnt <= gencnt) { 1079 struct xinpcb xi; 1080 1081 bzero(&xi, sizeof(xi)); 1082 xi.xi_len = sizeof xi; 1083 /* XXX should avoid extra copy */ 1084 bcopy(inp, &xi.xi_inp, 
sizeof *inp); 1085 if (inp->inp_socket) 1086 sotoxsocket(inp->inp_socket, &xi.xi_socket); 1087 INP_RUNLOCK(inp); 1088 error = SYSCTL_OUT(req, &xi, sizeof xi); 1089 } else 1090 INP_RUNLOCK(inp); 1091 } 1092 INP_INFO_WLOCK(&V_ripcbinfo); 1093 for (i = 0; i < n; i++) { 1094 inp = inp_list[i]; 1095 INP_RLOCK(inp); 1096 if (!in_pcbrele_rlocked(inp)) 1097 INP_RUNLOCK(inp); 1098 } 1099 INP_INFO_WUNLOCK(&V_ripcbinfo); 1100 1101 if (!error) { 1102 /* 1103 * Give the user an updated idea of our state. If the 1104 * generation differs from what we told her before, she knows 1105 * that something happened while we were processing this 1106 * request, and it might be necessary to retry. 1107 */ 1108 INP_INFO_RLOCK(&V_ripcbinfo); 1109 xig.xig_gen = V_ripcbinfo.ipi_gencnt; 1110 xig.xig_sogen = so_gencnt; 1111 xig.xig_count = V_ripcbinfo.ipi_count; 1112 INP_INFO_RUNLOCK(&V_ripcbinfo); 1113 error = SYSCTL_OUT(req, &xig, sizeof xig); 1114 } 1115 free(inp_list, M_TEMP); 1116 return (error); 1117 } 1118 1119 SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, 1120 CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, 1121 rip_pcblist, "S,xinpcb", "List of active raw IP sockets"); 1122 1123 #ifdef INET 1124 struct pr_usrreqs rip_usrreqs = { 1125 .pru_abort = rip_abort, 1126 .pru_attach = rip_attach, 1127 .pru_bind = rip_bind, 1128 .pru_connect = rip_connect, 1129 .pru_control = in_control, 1130 .pru_detach = rip_detach, 1131 .pru_disconnect = rip_disconnect, 1132 .pru_peeraddr = in_getpeeraddr, 1133 .pru_send = rip_send, 1134 .pru_shutdown = rip_shutdown, 1135 .pru_sockaddr = in_getsockaddr, 1136 .pru_sosetlabel = in_pcbsosetlabel, 1137 .pru_close = rip_close, 1138 }; 1139 #endif /* INET */ 1140