1 /*- 2 * Copyright (c) 1982, 1986, 1988, 1993 3 * The Regents of the University of California. 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 4. Neither the name of the University nor the names of its contributors 15 * may be used to endorse or promote products derived from this software 16 * without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 
29 * 30 * @(#)raw_ip.c 8.7 (Berkeley) 5/15/95 31 */ 32 33 #include <sys/cdefs.h> 34 __FBSDID("$FreeBSD$"); 35 36 #include "opt_inet.h" 37 #include "opt_inet6.h" 38 #include "opt_ipsec.h" 39 40 #include <sys/param.h> 41 #include <sys/jail.h> 42 #include <sys/kernel.h> 43 #include <sys/lock.h> 44 #include <sys/malloc.h> 45 #include <sys/mbuf.h> 46 #include <sys/priv.h> 47 #include <sys/proc.h> 48 #include <sys/protosw.h> 49 #include <sys/rwlock.h> 50 #include <sys/signalvar.h> 51 #include <sys/socket.h> 52 #include <sys/socketvar.h> 53 #include <sys/sx.h> 54 #include <sys/sysctl.h> 55 #include <sys/systm.h> 56 57 #include <vm/uma.h> 58 59 #include <net/if.h> 60 #include <net/route.h> 61 #include <net/vnet.h> 62 63 #include <netinet/in.h> 64 #include <netinet/in_systm.h> 65 #include <netinet/in_pcb.h> 66 #include <netinet/in_var.h> 67 #include <netinet/if_ether.h> 68 #include <netinet/ip.h> 69 #include <netinet/ip_var.h> 70 #include <netinet/ip_mroute.h> 71 72 #ifdef IPSEC 73 #include <netipsec/ipsec.h> 74 #endif /*IPSEC*/ 75 76 #include <security/mac/mac_framework.h> 77 78 VNET_DEFINE(int, ip_defttl) = IPDEFTTL; 79 SYSCTL_VNET_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW, 80 &VNET_NAME(ip_defttl), 0, 81 "Maximum TTL on IP packets"); 82 83 VNET_DEFINE(struct inpcbhead, ripcb); 84 VNET_DEFINE(struct inpcbinfo, ripcbinfo); 85 86 #define V_ripcb VNET(ripcb) 87 #define V_ripcbinfo VNET(ripcbinfo) 88 89 /* 90 * Control and data hooks for ipfw, dummynet, divert and so on. 91 * The data hooks are not used here but it is convenient 92 * to keep them all in one place. 
 */
VNET_DEFINE(ip_fw_chk_ptr_t, ip_fw_chk_ptr) = NULL;
VNET_DEFINE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr) = NULL;

/* Dummynet, divert(4) and netgraph/ipfw hooks, filled in by the modules. */
int	(*ip_dn_ctl_ptr)(struct sockopt *);
int	(*ip_dn_io_ptr)(struct mbuf **, int, struct ip_fw_args *);
void	(*ip_divert_ptr)(struct mbuf *, int);
int	(*ng_ipfw_input_p)(struct mbuf **, int,
			struct ip_fw_args *, int);

#ifdef INET
/*
 * Hooks for multicast routing.  They all default to NULL, so leave them not
 * initialized and rely on BSS being set to 0.
 */

/*
 * The socket used to communicate with the multicast routing daemon.
 */
VNET_DEFINE(struct socket *, ip_mrouter);

/*
 * The various mrouter and rsvp functions.
 */
int (*ip_mrouter_set)(struct socket *, struct sockopt *);
int (*ip_mrouter_get)(struct socket *, struct sockopt *);
int (*ip_mrouter_done)(void);
int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *,
    struct ip_moptions *);
int (*mrt_ioctl)(u_long, caddr_t, int);
int (*legal_vif_num)(int);
u_long (*ip_mcast_src)(int);

void (*rsvp_input_p)(struct mbuf *m, int off);
int (*ip_rsvp_vif)(struct socket *, struct sockopt *);
void (*ip_rsvp_force_done)(struct socket *);
#endif /* INET */

/* Default send/receive socket buffer sizes for raw IP sockets. */
u_long	rip_sendspace = 9216;
SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW,
    &rip_sendspace, 0, "Maximum outgoing raw IP datagram size");

u_long	rip_recvspace = 9216;
SYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW,
    &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams");

/*
 * Hash functions
 */

/*
 * Raw sockets are hashed by (protocol, local address, foreign address)
 * when all three are specified; bucket 0 is reserved for "wildcard"
 * sockets that leave any of the three unset (see rip_inshash() below and
 * the second lookup pass in rip_input()).
 */
#define INP_PCBHASH_RAW_SIZE	256
#define INP_PCBHASH_RAW(proto, laddr, faddr, mask) \
	(((proto) + (laddr) + (faddr)) % (mask) + 1)

#ifdef INET
/*
 * Insert an inpcb into the raw pcb hash.  Fully-specified pcbs (non-zero
 * protocol and both addresses set) go into their computed bucket; all
 * others go into bucket 0 so that wildcard matching only has to scan one
 * chain.  Caller must hold the pcbinfo write lock and the inp write lock.
 */
static void
rip_inshash(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct inpcbhead *pcbhash;
	int hash;

	INP_INFO_WLOCK_ASSERT(pcbinfo);
	INP_WLOCK_ASSERT(inp);

	if (inp->inp_ip_p != 0 &&
	    inp->inp_laddr.s_addr != INADDR_ANY &&
	    inp->inp_faddr.s_addr != INADDR_ANY) {
		hash = INP_PCBHASH_RAW(inp->inp_ip_p, inp->inp_laddr.s_addr,
		    inp->inp_faddr.s_addr, pcbinfo->ipi_hashmask);
	} else
		hash = 0;
	pcbhash = &pcbinfo->ipi_hashbase[hash];
	LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
}

/*
 * Remove an inpcb from the raw pcb hash.  Same locking requirements as
 * rip_inshash().
 */
static void
rip_delhash(struct inpcb *inp)
{

	INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
	INP_WLOCK_ASSERT(inp);

	LIST_REMOVE(inp, inp_hash);
}
#endif /* INET */

/*
 * Raw interface to IP protocol.
 */

/*
 * Initialize raw connection block q.
 */

/*
 * maxsockets_change event handler: track the kern.ipc.maxsockets limit in
 * the raw pcb UMA zone.
 */
static void
rip_zone_change(void *tag)
{

	uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets);
}

/*
 * UMA zone constructor for raw inpcbs: set up the per-pcb lock.
 */
static int
rip_inpcb_init(void *mem, int size, int flags)
{
	struct inpcb *inp = mem;

	INP_LOCK_INIT(inp, "inp", "rawinp");
	return (0);
}

/*
 * Protocol initialization: set up the raw pcbinfo/hash and register the
 * maxsockets event handler.
 */
void
rip_init(void)
{

	in_pcbinfo_init(&V_ripcbinfo, "rip", &V_ripcb, INP_PCBHASH_RAW_SIZE,
	    1, "ripcb", rip_inpcb_init, NULL, UMA_ZONE_NOFREE,
	    IPI_HASHFIELDS_NONE);
	EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL,
	    EVENTHANDLER_PRI_ANY);
}

#ifdef VIMAGE
/* Tear down the per-vnet raw pcbinfo. */
void
rip_destroy(void)
{

	in_pcbinfo_destroy(&V_ripcbinfo);
}
#endif

#ifdef INET
/*
 * Deliver mbuf chain 'n' to the socket behind pcb 'last', after running
 * the per-socket policy checks (IPsec, MAC, minimum TTL).  Consumes 'n'
 * in all cases.  Returns non-zero if the packet was rejected by policy.
 * Caller must hold the inp lock on 'last'.
 */
static int
rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n,
    struct sockaddr_in *ripsrc)
{
	int policyfail = 0;

	INP_LOCK_ASSERT(last);

#ifdef IPSEC
	/* check AH/ESP integrity. */
	if (ipsec4_in_reject(n, last)) {
		policyfail = 1;
	}
#endif /* IPSEC */
#ifdef MAC
	if (!policyfail && mac_inpcb_check_deliver(last, n) != 0)
		policyfail = 1;
#endif
	/* Check the minimum TTL for socket. */
	if (last->inp_ip_minttl && last->inp_ip_minttl > ip->ip_ttl)
		policyfail = 1;
	if (!policyfail) {
		struct mbuf *opts = NULL;
		struct socket *so;

		so = last->inp_socket;
		if ((last->inp_flags & INP_CONTROLOPTS) ||
		    (so->so_options & (SO_TIMESTAMP | SO_BINTIME)))
			ip_savecontrol(last, &opts, ip, n);
		SOCKBUF_LOCK(&so->so_rcv);
		if (sbappendaddr_locked(&so->so_rcv,
		    (struct sockaddr *)ripsrc, n, opts) == 0) {
			/* should notify about lost packet */
			m_freem(n);
			if (opts)
				m_freem(opts);
			SOCKBUF_UNLOCK(&so->so_rcv);
		} else
			sorwakeup_locked(so);
	} else
		m_freem(n);
	return (policyfail);
}

/*
 * Setup generic address and protocol structures for raw_input routine, then
 * pass them along with mbuf chain.
 *
 * Two lookup passes are made over the raw pcb hash: first the bucket for
 * fully-specified (proto, laddr, faddr) pcbs, then bucket 0 holding the
 * wildcard pcbs.  Each matching socket except the last receives a copy of
 * the packet; the original mbuf goes to the final match, or is freed if
 * there is none.
 */
void
rip_input(struct mbuf *m, int off)
{
	struct ifnet *ifp;
	struct ip *ip = mtod(m, struct ip *);
	int proto = ip->ip_p;
	struct inpcb *inp, *last;
	struct sockaddr_in ripsrc;
	int hash;

	bzero(&ripsrc, sizeof(ripsrc));
	ripsrc.sin_len = sizeof(ripsrc);
	ripsrc.sin_family = AF_INET;
	ripsrc.sin_addr = ip->ip_src;
	last = NULL;

	ifp = m->m_pkthdr.rcvif;
	/*
	 * Applications on raw sockets expect host byte order.
	 */
	ip->ip_len = ntohs(ip->ip_len);
	ip->ip_off = ntohs(ip->ip_off);

	hash = INP_PCBHASH_RAW(proto, ip->ip_src.s_addr,
	    ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask);
	INP_INFO_RLOCK(&V_ripcbinfo);
	/* Pass 1: exact-match pcbs in their computed bucket. */
	LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[hash], inp_hash) {
		if (inp->inp_ip_p != proto)
			continue;
#ifdef INET6
		/* XXX inp locking */
		if ((inp->inp_vflag & INP_IPV4) == 0)
			continue;
#endif
		if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
			continue;
		if (inp->inp_faddr.s_addr != ip->ip_src.s_addr)
			continue;
		if (jailed_without_vnet(inp->inp_cred)) {
			/*
			 * XXX: If faddr was bound to multicast group,
			 * jailed raw socket will drop datagram.
			 */
			if (prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0)
				continue;
		}
		/*
		 * 'last' trails the walk by one match: the previous match
		 * gets a copy of the packet, the final one will get the
		 * original mbuf after the loops finish.
		 */
		if (last != NULL) {
			struct mbuf *n;

			n = m_copy(m, 0, (int)M_COPYALL);
			if (n != NULL)
				(void) rip_append(last, ip, n, &ripsrc);
			/* XXX count dropped packet */
			INP_RUNLOCK(last);
		}
		INP_RLOCK(inp);
		last = inp;
	}
	/* Pass 2: wildcard pcbs in bucket 0. */
	LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[0], inp_hash) {
		if (inp->inp_ip_p && inp->inp_ip_p != proto)
			continue;
#ifdef INET6
		/* XXX inp locking */
		if ((inp->inp_vflag & INP_IPV4) == 0)
			continue;
#endif
		if (!in_nullhost(inp->inp_laddr) &&
		    !in_hosteq(inp->inp_laddr, ip->ip_dst))
			continue;
		if (!in_nullhost(inp->inp_faddr) &&
		    !in_hosteq(inp->inp_faddr, ip->ip_src))
			continue;
		if (jailed_without_vnet(inp->inp_cred)) {
			/*
			 * Allow raw socket in jail to receive multicast;
			 * assume process had PRIV_NETINET_RAW at attach,
			 * and fall through into normal filter path if so.
			 */
			if (!IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
			    prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0)
				continue;
		}
		/*
		 * If this raw socket has multicast state, and we
		 * have received a multicast, check if this socket
		 * should receive it, as multicast filtering is now
		 * the responsibility of the transport layer.
		 */
		if (inp->inp_moptions != NULL &&
		    IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
			/*
			 * If the incoming datagram is for IGMP, allow it
			 * through unconditionally to the raw socket.
			 *
			 * In the case of IGMPv2, we may not have explicitly
			 * joined the group, and may have set IFF_ALLMULTI
			 * on the interface.  imo_multi_filter() may discard
			 * control traffic we actually need to see.
			 *
			 * Userland multicast routing daemons should continue
			 * to filter the control traffic appropriately.
			 */
			int blocked;

			blocked = MCAST_PASS;
			if (proto != IPPROTO_IGMP) {
				struct sockaddr_in group;

				bzero(&group, sizeof(struct sockaddr_in));
				group.sin_len = sizeof(struct sockaddr_in);
				group.sin_family = AF_INET;
				group.sin_addr = ip->ip_dst;

				blocked = imo_multi_filter(inp->inp_moptions,
				    ifp,
				    (struct sockaddr *)&group,
				    (struct sockaddr *)&ripsrc);
			}

			if (blocked != MCAST_PASS) {
				IPSTAT_INC(ips_notmember);
				continue;
			}
		}
		if (last != NULL) {
			struct mbuf *n;

			n = m_copy(m, 0, (int)M_COPYALL);
			if (n != NULL)
				(void) rip_append(last, ip, n, &ripsrc);
			/* XXX count dropped packet */
			INP_RUNLOCK(last);
		}
		INP_RLOCK(inp);
		last = inp;
	}
	INP_INFO_RUNLOCK(&V_ripcbinfo);
	if (last != NULL) {
		/*
		 * NOTE(review): rip_append() returns non-zero on a policy
		 * *failure*, yet ips_delivered is incremented on that path
		 * and decremented in the no-listener branch below; ip_input()
		 * appears to account delivery before handing off — verify the
		 * intended counter semantics before changing this.
		 */
		if (rip_append(last, ip, m, &ripsrc) != 0)
			IPSTAT_INC(ips_delivered);
		INP_RUNLOCK(last);
	} else {
		m_freem(m);
		IPSTAT_INC(ips_noproto);
		IPSTAT_DEC(ips_delivered);
	}
}

/*
 * Generate IP header and pass packet to ip_output.
 * Tack on options user may have setup with control call.
 */
int
rip_output(struct mbuf *m, struct socket *so, u_long dst)
{
	struct ip *ip;
	int error;
	struct inpcb *inp = sotoinpcb(so);
	int flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) |
	    IP_ALLOWBROADCAST;

	/*
	 * If the user handed us a complete IP packet, use it.  Otherwise,
	 * allocate an mbuf for a header and fill it in.
	 */
	if ((inp->inp_flags & INP_HDRINCL) == 0) {
		/* Kernel builds the header: bound the payload first. */
		if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) {
			m_freem(m);
			return(EMSGSIZE);
		}
		/* M_PREPEND may replace 'm'; it frees the chain on failure. */
		M_PREPEND(m, sizeof(struct ip), M_NOWAIT);
		if (m == NULL)
			return(ENOBUFS);

		INP_RLOCK(inp);
		ip = mtod(m, struct ip *);
		ip->ip_tos = inp->inp_ip_tos;
		if (inp->inp_flags & INP_DONTFRAG)
			ip->ip_off = htons(IP_DF);
		else
			ip->ip_off = htons(0);
		ip->ip_p = inp->inp_ip_p;
		ip->ip_len = htons(m->m_pkthdr.len);
		ip->ip_src = inp->inp_laddr;
		if (jailed(inp->inp_cred)) {
			/*
			 * prison_local_ip4() would be good enough but would
			 * let a source of INADDR_ANY pass, which we do not
			 * want to see from jails.  We do not go through the
			 * pain of in_pcbladdr() for raw sockets.
			 */
			if (ip->ip_src.s_addr == INADDR_ANY)
				error = prison_get_ip4(inp->inp_cred,
				    &ip->ip_src);
			else
				error = prison_local_ip4(inp->inp_cred,
				    &ip->ip_src);
			if (error != 0) {
				INP_RUNLOCK(inp);
				m_freem(m);
				return (error);
			}
		}
		ip->ip_dst.s_addr = dst;
		ip->ip_ttl = inp->inp_ip_ttl;
	} else {
		/* User supplied the full header (IP_HDRINCL). */
		if (m->m_pkthdr.len > IP_MAXPACKET) {
			m_freem(m);
			return(EMSGSIZE);
		}
		INP_RLOCK(inp);
		ip = mtod(m, struct ip *);
		error = prison_check_ip4(inp->inp_cred, &ip->ip_src);
		if (error != 0) {
			INP_RUNLOCK(inp);
			m_freem(m);
			return (error);
		}

		/*
		 * Don't allow both user specified and setsockopt options,
		 * and don't allow packet length sizes that will crash.
		 */
		if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options)
		    || (ip->ip_len > m->m_pkthdr.len)
		    || (ip->ip_len < (ip->ip_hl << 2))) {
			INP_RUNLOCK(inp);
			m_freem(m);
			return (EINVAL);
		}
		if (ip->ip_id == 0)
			ip->ip_id = ip_newid();

		/*
		 * Applications on raw sockets pass us packets
		 * in host byte order.
		 */
		ip->ip_len = htons(ip->ip_len);
		ip->ip_off = htons(ip->ip_off);

		/*
		 * XXX prevent ip_output from overwriting header fields.
		 */
		flags |= IP_RAWOUTPUT;
		IPSTAT_INC(ips_rawout);
	}

	if (inp->inp_flags & INP_ONESBCAST)
		flags |= IP_SENDONES;

#ifdef MAC
	mac_inpcb_create_mbuf(inp, m);
#endif

	error = ip_output(m, inp->inp_options, NULL, flags,
	    inp->inp_moptions, inp);
	INP_RUNLOCK(inp);
	return (error);
}

/*
 * Raw IP socket option processing.
 *
 * IMPORTANT NOTE regarding access control: Traditionally, raw sockets could
 * only be created by a privileged process, and as such, socket option
 * operations to manage system properties on any raw socket were allowed to
 * take place without explicit additional access control checks.  However,
 * raw sockets can now also be created in jail(), and therefore explicit
 * checks are now required.  Likewise, raw sockets can be used by a process
 * after it gives up privilege, so some caution is required.  For options
 * passed down to the IP layer via ip_ctloutput(), checks are assumed to be
 * performed in ip_ctloutput() and therefore no check occurs here.
 * Unilaterally checking priv_check() here breaks normal IP socket option
 * operations on raw sockets.
 *
 * When adding new socket options here, make sure to add access control
 * checks here as necessary.
 *
 * XXX-BZ inp locking?
 */
int
rip_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct inpcb *inp = sotoinpcb(so);
	int error, optval;

	if (sopt->sopt_level != IPPROTO_IP) {
		/* SO_SETFIB is the only non-IPPROTO_IP option honored. */
		if ((sopt->sopt_level == SOL_SOCKET) &&
		    (sopt->sopt_name == SO_SETFIB)) {
			inp->inp_inc.inc_fibnum = so->so_fibnum;
			return (0);
		}
		return (EINVAL);
	}

	error = 0;
	switch (sopt->sopt_dir) {
	case SOPT_GET:
		switch (sopt->sopt_name) {
		case IP_HDRINCL:
			optval = inp->inp_flags & INP_HDRINCL;
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;

		case IP_FW3:	/* generic ipfw v.3 functions */
		case IP_FW_ADD:	/* ADD actually returns the body... */
		case IP_FW_GET:
		case IP_FW_TABLE_GETSIZE:
		case IP_FW_TABLE_LIST:
		case IP_FW_NAT_GET_CONFIG:
		case IP_FW_NAT_GET_LOG:
			/* Dispatch to ipfw if the module is loaded. */
			if (V_ip_fw_ctl_ptr != NULL)
				error = V_ip_fw_ctl_ptr(sopt);
			else
				error = ENOPROTOOPT;
			break;

		case IP_DUMMYNET3:	/* generic dummynet v.3 functions */
		case IP_DUMMYNET_GET:
			if (ip_dn_ctl_ptr != NULL)
				error = ip_dn_ctl_ptr(sopt);
			else
				error = ENOPROTOOPT;
			break;

		case MRT_INIT:
		case MRT_DONE:
		case MRT_ADD_VIF:
		case MRT_DEL_VIF:
		case MRT_ADD_MFC:
		case MRT_DEL_MFC:
		case MRT_VERSION:
		case MRT_ASSERT:
		case MRT_API_SUPPORT:
		case MRT_API_CONFIG:
		case MRT_ADD_BW_UPCALL:
		case MRT_DEL_BW_UPCALL:
			/* Multicast-routing state requires privilege. */
			error = priv_check(curthread, PRIV_NETINET_MROUTE);
			if (error != 0)
				return (error);
			error = ip_mrouter_get ? ip_mrouter_get(so, sopt) :
			    EOPNOTSUPP;
			break;

		default:
			error = ip_ctloutput(so, sopt);
			break;
		}
		break;

	case SOPT_SET:
		switch (sopt->sopt_name) {
		case IP_HDRINCL:
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				break;
			if (optval)
				inp->inp_flags |= INP_HDRINCL;
			else
				inp->inp_flags &= ~INP_HDRINCL;
			break;

		case IP_FW3:	/* generic ipfw v.3 functions */
		case IP_FW_ADD:
		case IP_FW_DEL:
		case IP_FW_FLUSH:
		case IP_FW_ZERO:
		case IP_FW_RESETLOG:
		case IP_FW_TABLE_ADD:
		case IP_FW_TABLE_DEL:
		case IP_FW_TABLE_FLUSH:
		case IP_FW_NAT_CFG:
		case IP_FW_NAT_DEL:
			if (V_ip_fw_ctl_ptr != NULL)
				error = V_ip_fw_ctl_ptr(sopt);
			else
				error = ENOPROTOOPT;
			break;

		case IP_DUMMYNET3:	/* generic dummynet v.3 functions */
		case IP_DUMMYNET_CONFIGURE:
		case IP_DUMMYNET_DEL:
		case IP_DUMMYNET_FLUSH:
			if (ip_dn_ctl_ptr != NULL)
				error = ip_dn_ctl_ptr(sopt);
			else
				error = ENOPROTOOPT;
			break;

		case IP_RSVP_ON:
			error = priv_check(curthread, PRIV_NETINET_MROUTE);
			if (error != 0)
				return (error);
			error = ip_rsvp_init(so);
			break;

		case IP_RSVP_OFF:
			error = priv_check(curthread, PRIV_NETINET_MROUTE);
			if (error != 0)
				return (error);
			error = ip_rsvp_done();
			break;

		case IP_RSVP_VIF_ON:
		case IP_RSVP_VIF_OFF:
			error = priv_check(curthread, PRIV_NETINET_MROUTE);
			if (error != 0)
				return (error);
			error = ip_rsvp_vif ? ip_rsvp_vif(so, sopt) : EINVAL;
			break;

		case MRT_INIT:
		case MRT_DONE:
		case MRT_ADD_VIF:
		case MRT_DEL_VIF:
		case MRT_ADD_MFC:
		case MRT_DEL_MFC:
		case MRT_VERSION:
		case MRT_ASSERT:
		case MRT_API_SUPPORT:
		case MRT_API_CONFIG:
		case MRT_ADD_BW_UPCALL:
		case MRT_DEL_BW_UPCALL:
			error = priv_check(curthread, PRIV_NETINET_MROUTE);
			if (error != 0)
				return (error);
			error = ip_mrouter_set ? ip_mrouter_set(so, sopt) :
			    EOPNOTSUPP;
			break;

		default:
			error = ip_ctloutput(so, sopt);
			break;
		}
		break;
	}

	return (error);
}

/*
 * This function exists solely to receive the PRC_IFDOWN messages which are
 * sent by if_down().  It looks for an ifaddr whose ifa_addr is sa, and calls
 * in_ifadown() to remove all routes corresponding to that address.  It also
 * receives the PRC_IFUP messages from if_up() and reinstalls the interface
 * routes.
 */
void
rip_ctlinput(int cmd, struct sockaddr *sa, void *vip)
{
	struct in_ifaddr *ia;
	struct ifnet *ifp;
	int err;
	int flags;

	switch (cmd) {
	case PRC_IFDOWN:
		IN_IFADDR_RLOCK();
		TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
			if (ia->ia_ifa.ifa_addr == sa
			    && (ia->ia_flags & IFA_ROUTE)) {
				/* Hold the ifa across the unlocked calls. */
				ifa_ref(&ia->ia_ifa);
				IN_IFADDR_RUNLOCK();
				/*
				 * in_ifscrub kills the interface route.
				 */
				in_ifscrub(ia->ia_ifp, ia, 0);
				/*
				 * in_ifadown gets rid of all the rest of the
				 * routes.  This is not quite the right thing
				 * to do, but at least if we are running a
				 * routing process they will come back.
				 */
				in_ifadown(&ia->ia_ifa, 0);
				ifa_free(&ia->ia_ifa);
				break;
			}
		}
		if (ia == NULL)		/* If ia matched, already unlocked. */
			IN_IFADDR_RUNLOCK();
		break;

	case PRC_IFUP:
		IN_IFADDR_RLOCK();
		TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
			if (ia->ia_ifa.ifa_addr == sa)
				break;
		}
		if (ia == NULL || (ia->ia_flags & IFA_ROUTE)) {
			IN_IFADDR_RUNLOCK();
			return;
		}
		ifa_ref(&ia->ia_ifa);
		IN_IFADDR_RUNLOCK();
		flags = RTF_UP;
		ifp = ia->ia_ifa.ifa_ifp;

		if ((ifp->if_flags & IFF_LOOPBACK)
		    || (ifp->if_flags & IFF_POINTOPOINT))
			flags |= RTF_HOST;

		/* Re-plumb loopback and interface routes for the address. */
		err = ifa_del_loopback_route((struct ifaddr *)ia, sa);
		if (err == 0)
			ia->ia_flags &= ~IFA_RTSELF;

		err = rtinit(&ia->ia_ifa, RTM_ADD, flags);
		if (err == 0)
			ia->ia_flags |= IFA_ROUTE;

		err = ifa_add_loopback_route((struct ifaddr *)ia, sa);
		if (err == 0)
			ia->ia_flags |= IFA_RTSELF;

		ifa_free(&ia->ia_ifa);
		break;
	}
}

/*
 * Allocate and initialize the pcb for a new raw socket.  Requires
 * PRIV_NETINET_RAW; reserves socket buffer space from rip_sendspace /
 * rip_recvspace.
 */
static int
rip_attach(struct socket *so, int proto, struct thread *td)
{
	struct inpcb *inp;
	int error;

	inp = sotoinpcb(so);
	KASSERT(inp == NULL, ("rip_attach: inp != NULL"));

	error = priv_check(td, PRIV_NETINET_RAW);
	if (error)
		return (error);
	if (proto >= IPPROTO_MAX || proto < 0)
		return EPROTONOSUPPORT;
	error = soreserve(so, rip_sendspace, rip_recvspace);
	if (error)
		return (error);
	INP_INFO_WLOCK(&V_ripcbinfo);
	error = in_pcballoc(so, &V_ripcbinfo);
	if (error) {
		INP_INFO_WUNLOCK(&V_ripcbinfo);
		return (error);
	}
	inp = (struct inpcb *)so->so_pcb;
	inp->inp_vflag |= INP_IPV4;
	inp->inp_ip_p = proto;
	inp->inp_ip_ttl = V_ip_defttl;
	rip_inshash(inp);
	INP_INFO_WUNLOCK(&V_ripcbinfo);
	INP_WUNLOCK(inp);
	return (0);
}

/*
 * Tear down a raw socket's pcb, notifying the multicast routing and RSVP
 * subsystems if this socket was registered with them.
 */
static void
rip_detach(struct socket *so)
{
	struct inpcb *inp;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("rip_detach: inp == NULL"));
	KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
	    ("rip_detach: not closed"));

	INP_INFO_WLOCK(&V_ripcbinfo);
	INP_WLOCK(inp);
	rip_delhash(inp);
	if (so == V_ip_mrouter && ip_mrouter_done)
		ip_mrouter_done();
	if (ip_rsvp_force_done)
		ip_rsvp_force_done(so);
	if (so == V_ip_rsvpd)
		ip_rsvp_done();
	in_pcbdetach(inp);
	in_pcbfree(inp);
	INP_INFO_WUNLOCK(&V_ripcbinfo);
}

/*
 * Common disconnect path shared by abort/close/disconnect: clear the
 * foreign address (rehashing into the wildcard bucket) and mark the socket
 * no longer connected.
 */
static void
rip_dodisconnect(struct socket *so, struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo;

	pcbinfo = inp->inp_pcbinfo;
	INP_INFO_WLOCK(pcbinfo);
	INP_WLOCK(inp);
	rip_delhash(inp);
	inp->inp_faddr.s_addr = INADDR_ANY;
	rip_inshash(inp);
	SOCK_LOCK(so);
	so->so_state &= ~SS_ISCONNECTED;
	SOCK_UNLOCK(so);
	INP_WUNLOCK(inp);
	INP_INFO_WUNLOCK(pcbinfo);
}

static void
rip_abort(struct socket *so)
{
	struct inpcb *inp;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("rip_abort: inp == NULL"));

	rip_dodisconnect(so, inp);
}

static void
rip_close(struct socket *so)
{
	struct inpcb *inp;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("rip_close: inp == NULL"));

	rip_dodisconnect(so, inp);
}

static int
rip_disconnect(struct socket *so)
{
	struct inpcb *inp;

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("rip_disconnect: inp == NULL"));

	rip_dodisconnect(so, inp);
	return (0);
}

/*
 * Bind a raw socket to a local address.  The address must belong to a
 * local interface unless INP_BINDANY is set, and must pass the jail check.
 */
static int
rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	struct sockaddr_in *addr = (struct sockaddr_in *)nam;
	struct inpcb *inp;
	int error;

	if (nam->sa_len != sizeof(*addr))
		return (EINVAL);

	error = prison_check_ip4(td->td_ucred, &addr->sin_addr);
	if (error != 0)
		return (error);

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("rip_bind: inp == NULL"));

	if (TAILQ_EMPTY(&V_ifnet) ||
	    (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) ||
	    (addr->sin_addr.s_addr &&
	     (inp->inp_flags & INP_BINDANY) == 0 &&
	     ifa_ifwithaddr_check((struct sockaddr *)addr) == 0))
		return (EADDRNOTAVAIL);

	INP_INFO_WLOCK(&V_ripcbinfo);
	INP_WLOCK(inp);
	rip_delhash(inp);
	inp->inp_laddr = addr->sin_addr;
	rip_inshash(inp);
	INP_WUNLOCK(inp);
	INP_INFO_WUNLOCK(&V_ripcbinfo);
	return (0);
}

/*
 * "Connect" a raw socket: record the foreign address, rehash, and mark the
 * socket connected.
 */
static int
rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	struct sockaddr_in *addr = (struct sockaddr_in *)nam;
	struct inpcb *inp;

	if (nam->sa_len != sizeof(*addr))
		return (EINVAL);
	if (TAILQ_EMPTY(&V_ifnet))
		return (EADDRNOTAVAIL);
	if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK)
		return (EAFNOSUPPORT);

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("rip_connect: inp == NULL"));

	INP_INFO_WLOCK(&V_ripcbinfo);
	INP_WLOCK(inp);
	rip_delhash(inp);
	inp->inp_faddr = addr->sin_addr;
	rip_inshash(inp);
	soisconnected(so);
	INP_WUNLOCK(inp);
	INP_INFO_WUNLOCK(&V_ripcbinfo);
	return (0);
}

static int
rip_shutdown(struct socket *so)
{
	struct inpcb *inp;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("rip_shutdown: inp == NULL"));

	INP_WLOCK(inp);
	socantsendmore(so);
	INP_WUNLOCK(inp);
	return (0);
}

/*
 * Resolve the destination address (from the connected pcb or the supplied
 * sockaddr) and hand the mbuf to rip_output().
 */
static int
rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
    struct mbuf *control, struct thread *td)
{
	struct inpcb *inp;
	u_long dst;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("rip_send: inp == NULL"));

	/*
	 * Note: 'dst' reads below are unlocked.
	 */
	if (so->so_state & SS_ISCONNECTED) {
		if (nam) {
			m_freem(m);
			return (EISCONN);
		}
		dst = inp->inp_faddr.s_addr;	/* Unlocked read. */
	} else {
		if (nam == NULL) {
			m_freem(m);
			return (ENOTCONN);
		}
		dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr;
	}
	return (rip_output(m, so, dst));
}
#endif /* INET */

/*
 * sysctl handler exporting the list of active raw IP pcbs as xinpcb
 * records (net.inet.raw.pcblist).
 */
static int
rip_pcblist(SYSCTL_HANDLER_ARGS)
{
	int error, i, n;
	struct inpcb *inp, **inp_list;
	inp_gen_t gencnt;
	struct xinpgen xig;

	/*
	 * The process of preparing the TCB list is too time-consuming and
	 * resource-intensive to repeat twice on every request.
	 */
	if (req->oldptr == 0) {
		/* Size-probe only: report a padded estimate. */
		n = V_ripcbinfo.ipi_count;
		n += imax(n / 8, 10);
		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb);
		return (0);
	}

	if (req->newptr != 0)
		return (EPERM);

	/*
	 * OK, now we're committed to doing something.
	 */
	INP_INFO_RLOCK(&V_ripcbinfo);
	gencnt = V_ripcbinfo.ipi_gencnt;
	n = V_ripcbinfo.ipi_count;
	INP_INFO_RUNLOCK(&V_ripcbinfo);

	xig.xig_len = sizeof xig;
	xig.xig_count = n;
	xig.xig_gen = gencnt;
	xig.xig_sogen = so_gencnt;
	error = SYSCTL_OUT(req, &xig, sizeof xig);
	if (error)
		return (error);

	/*
	 * NOTE(review): M_WAITOK malloc does not return NULL, so the check
	 * below appears to be dead code — confirm before removing.
	 */
	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
	if (inp_list == 0)
		return (ENOMEM);

	/* Collect referenced pcbs visible to the caller's credentials. */
	INP_INFO_RLOCK(&V_ripcbinfo);
	for (inp = LIST_FIRST(V_ripcbinfo.ipi_listhead), i = 0; inp && i < n;
	     inp = LIST_NEXT(inp, inp_list)) {
		INP_WLOCK(inp);
		if (inp->inp_gencnt <= gencnt &&
		    cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
			in_pcbref(inp);
			inp_list[i++] = inp;
		}
		INP_WUNLOCK(inp);
	}
	INP_INFO_RUNLOCK(&V_ripcbinfo);
	n = i;

	/* Export each collected pcb. */
	error = 0;
	for (i = 0; i < n; i++) {
		inp = inp_list[i];
		INP_RLOCK(inp);
		if (inp->inp_gencnt <= gencnt) {
			struct xinpcb xi;

			bzero(&xi, sizeof(xi));
			xi.xi_len = sizeof xi;
			/* XXX should avoid extra copy */
			bcopy(inp, &xi.xi_inp, sizeof *inp);
			if (inp->inp_socket)
				sotoxsocket(inp->inp_socket, &xi.xi_socket);
			INP_RUNLOCK(inp);
			error = SYSCTL_OUT(req, &xi, sizeof xi);
		} else
			INP_RUNLOCK(inp);
	}

	/* Drop the references taken above. */
	INP_INFO_WLOCK(&V_ripcbinfo);
	for (i = 0; i < n; i++) {
		inp = inp_list[i];
		INP_RLOCK(inp);
		if (!in_pcbrele_rlocked(inp))
			INP_RUNLOCK(inp);
	}
	INP_INFO_WUNLOCK(&V_ripcbinfo);

	if (!error) {
		/*
		 * Give the user an updated idea of our state.  If the
		 * generation differs from what we told her before, she knows
		 * that something happened while we were processing this
		 * request, and it might be necessary to retry.
		 */
		INP_INFO_RLOCK(&V_ripcbinfo);
		xig.xig_gen = V_ripcbinfo.ipi_gencnt;
		xig.xig_sogen = so_gencnt;
		xig.xig_count = V_ripcbinfo.ipi_count;
		INP_INFO_RUNLOCK(&V_ripcbinfo);
		error = SYSCTL_OUT(req, &xig, sizeof xig);
	}
	free(inp_list, M_TEMP);
	return (error);
}

SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist,
    CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0,
    rip_pcblist, "S,xinpcb", "List of active raw IP sockets");

#ifdef INET
/* Protocol user-request switch for raw IPv4 sockets. */
struct pr_usrreqs rip_usrreqs = {
	.pru_abort =		rip_abort,
	.pru_attach =		rip_attach,
	.pru_bind =		rip_bind,
	.pru_connect =		rip_connect,
	.pru_control =		in_control,
	.pru_detach =		rip_detach,
	.pru_disconnect =	rip_disconnect,
	.pru_peeraddr =		in_getpeeraddr,
	.pru_send =		rip_send,
	.pru_shutdown =		rip_shutdown,
	.pru_sockaddr =		in_getsockaddr,
	.pru_sosetlabel =	in_pcbsosetlabel,
	.pru_close =		rip_close,
};
#endif /* INET */