/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1993
 *	The Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_route.h"

#include <sys/param.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/rwlock.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stdarg.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/systm.h>

#include <vm/uma.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>
#include <net/route/route_ctl.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_fib.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/if_ether.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_mroute.h>
#include <netinet/ip_icmp.h>

#include <netipsec/ipsec_support.h>

#include <security/mac/mac_framework.h>

/*
 * Per-protocol input handler table, defined elsewhere.  rip_input() compares
 * the entry for the packet's protocol against itself to decide whether an
 * undelivered packet warrants an ICMP "protocol unreachable".
 */
extern ipproto_input_t *ip_protox[];

/* Per-vnet default TTL; copied into each new raw PCB by rip_attach(). */
VNET_DEFINE(int, ip_defttl) = IPDEFTTL;
SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(ip_defttl), 0,
    "Maximum TTL on IP packets");

/* Per-vnet PCB database holding every raw IP socket. */
VNET_DEFINE(struct inpcbinfo, ripcbinfo);
#define	V_ripcbinfo		VNET(ripcbinfo)

/*
 * Control and data hooks for ipfw, dummynet, divert and so on.
 * The data hooks are not used here but it is convenient
 * to keep them all in one place.
 */
VNET_DEFINE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr) = NULL;

int	(*ip_dn_ctl_ptr)(struct sockopt *);
int	(*ip_dn_io_ptr)(struct mbuf **, struct ip_fw_args *);
void	(*ip_divert_ptr)(struct mbuf *, bool);
int	(*ng_ipfw_input_p)(struct mbuf **, struct ip_fw_args *, bool);

#ifdef INET
/*
 * Hooks for multicast routing.  They all default to NULL, so leave them not
 * initialized and rely on BSS being set to 0.
 */

/*
 * The socket used to communicate with the multicast routing daemon.
 */
VNET_DEFINE(struct socket *, ip_mrouter);

/*
 * The various mrouter and rsvp functions.
 */
int (*ip_mrouter_set)(struct socket *, struct sockopt *);
int (*ip_mrouter_get)(struct socket *, struct sockopt *);
int (*ip_mrouter_done)(void);
int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *,
		   struct ip_moptions *);
int (*mrt_ioctl)(u_long, caddr_t, int);
int (*legal_vif_num)(int);
u_long (*ip_mcast_src)(int);

int (*rsvp_input_p)(struct mbuf **, int *, int);
int (*ip_rsvp_vif)(struct socket *, struct sockopt *);
void (*ip_rsvp_force_done)(struct socket *);
#endif /* INET */

/*
 * When zero, rip_input() only delivers a packet to sockets whose
 * inc_fibnum equals the packet's FIB; when non-zero (the default) the FIB
 * is not considered.
 */
#define	V_rip_bind_all_fibs	VNET(rip_bind_all_fibs)
VNET_DEFINE(int, rip_bind_all_fibs) = 1;
SYSCTL_INT(_net_inet_raw, OID_AUTO, bind_all_fibs, CTLFLAG_VNET | CTLFLAG_RDTUN,
    &VNET_NAME(rip_bind_all_fibs), 0,
    "Bound sockets receive traffic from all FIBs");

/* Send buffer reservation passed to soreserve() in rip_attach(). */
u_long	rip_sendspace = 9216;
SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW,
    &rip_sendspace, 0, "Maximum outgoing raw IP datagram size");

/* Receive buffer reservation passed to soreserve() in rip_attach(). */
u_long	rip_recvspace = 9216;
SYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW,
    &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams");

/*
 * Hash functions
 *
 * INP_PCBHASH_RAW() sums protocol and both addresses, reduces the sum
 * modulo 'mask' and adds one, so it only ever yields buckets 1..mask.
 * Bucket 0 is never produced by the macro; rip_inshash() uses it for PCBs
 * that are not fully specified.  Because the hash is a plain sum, the
 * order in which the two addresses are passed does not matter.
 */

#define INP_PCBHASH_RAW_SIZE	256
#define INP_PCBHASH_RAW(proto, laddr, faddr, mask) \
        (((proto) + (laddr) + (faddr)) % (mask) + 1)

#ifdef INET
/*
 * Link a raw PCB onto the appropriate hash chain of its pcbinfo.
 *
 * A PCB with a non-zero protocol and both local and foreign addresses set
 * is placed in the bucket computed by INP_PCBHASH_RAW(); every other PCB
 * goes into bucket 0.  The pcbinfo hash write lock and the inp write lock
 * must be held (both are asserted).
 */
static void
rip_inshash(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct inpcbhead *pcbhash;
	int hash;

	INP_HASH_WLOCK_ASSERT(pcbinfo);
	INP_WLOCK_ASSERT(inp);

	if (inp->inp_ip_p != 0 &&
	    inp->inp_laddr.s_addr != INADDR_ANY &&
	    inp->inp_faddr.s_addr != INADDR_ANY) {
		hash = INP_PCBHASH_RAW(inp->inp_ip_p, inp->inp_laddr.s_addr,
		    inp->inp_faddr.s_addr, pcbinfo->ipi_hashmask);
	} else
		hash = 0;
	pcbhash = &pcbinfo->ipi_hash_exact[hash];
	CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_exact);
}

/*
 * Unlink a raw PCB from whichever hash chain it is currently on.  The
 * pcbinfo hash write lock and the inp write lock must be held (both are
 * asserted).
 */
static void
rip_delhash(struct inpcb *inp)
{

	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
	INP_WLOCK_ASSERT(inp);

	CK_LIST_REMOVE(inp, inp_hash_exact);
}
#endif /* INET */

INPCBSTORAGE_DEFINE(ripcbstor, inpcb, "rawinp", "ripcb", "rip", "riphash");

/*
 * Per-vnet initialization: set up the raw PCB database with
 * INP_PCBHASH_RAW_SIZE hash buckets.
 */
static void
rip_init(void *arg __unused)
{

	in_pcbinfo_init(&V_ripcbinfo, &ripcbstor, INP_PCBHASH_RAW_SIZE, 1);
}
VNET_SYSINIT(rip_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rip_init, NULL);

#ifdef VIMAGE
/*
 * Per-vnet teardown: destroy the raw PCB database created by rip_init().
 */
static void
rip_destroy(void *unused __unused)
{

	in_pcbinfo_destroy(&V_ripcbinfo);
}
VNET_SYSUNINIT(raw_ip, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, rip_destroy, NULL);
#endif

#ifdef INET
/*
 * Try to queue a copy of packet 'm' on the receive buffer of the socket
 * behind 'inp', with 'ripsrc' as the source address.
 *
 * The packet is dropped for this socket (return 0) if the IPsec policy
 * check or the MAC delivery check rejects it, if its TTL is below the
 * socket's minimum TTL, if the copy cannot be allocated, or if the
 * receive buffer has no room.  The original 'm' is never freed or
 * modified here; only the copy is queued or freed.
 *
 * Returns 1 if the copy was appended and the reader woken up, 0 otherwise.
 * The inp must be locked (asserted).
 */
static int
rip_append(struct inpcb *inp, struct ip *ip, struct mbuf *m,
    struct sockaddr_in *ripsrc)
{
	struct socket *so = inp->inp_socket;
	struct mbuf *n, *opts = NULL;

	INP_LOCK_ASSERT(inp);

#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	/* check AH/ESP integrity. */
	if (IPSEC_ENABLED(ipv4) && IPSEC_CHECK_POLICY(ipv4, m, inp) != 0)
		return (0);
#endif /* IPSEC */
#ifdef MAC
	if (mac_inpcb_check_deliver(inp, m) != 0)
		return (0);
#endif
	/* Check the minimum TTL for socket. */
	if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl)
		return (0);

	if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL)
		return (0);

	/* Build control data only if the socket asked for any. */
	if ((inp->inp_flags & INP_CONTROLOPTS) ||
	    (so->so_options & (SO_TIMESTAMP | SO_BINTIME)))
		ip_savecontrol(inp, &opts, ip, n);
	SOCKBUF_LOCK(&so->so_rcv);
	if (sbappendaddr_locked(&so->so_rcv,
	    (struct sockaddr *)ripsrc, n, opts) == 0) {
		/*
		 * NOTE(review): no explicit SOCKBUF_UNLOCK on this path or
		 * after sorwakeup_locked() below; presumably the *_locked
		 * helpers release the sockbuf lock -- confirm.
		 */
		soroverflow_locked(so);
		m_freem(n);
		if (opts)
			m_freem(opts);
		return (0);
	}
	sorwakeup_locked(so);

	return (1);
}

/*
 * Context handed to the inpcb iterator match callbacks: the IP header of
 * the packet being delivered and its protocol number.
 */
struct rip_inp_match_ctx {
	struct ip *ip;
	int proto;
};

/*
 * Exact-match predicate: accept only an IPv4 PCB whose protocol, local
 * address and foreign address all equal the packet's protocol, destination
 * and source respectively.
 */
static bool
rip_inp_match1(const struct inpcb *inp, void *v)
{
	struct rip_inp_match_ctx *ctx = v;

	if (inp->inp_ip_p != ctx->proto)
		return (false);
#ifdef INET6
	/* XXX inp locking */
	if ((inp->inp_vflag & INP_IPV4) == 0)
		return (false);
#endif
	if (inp->inp_laddr.s_addr != ctx->ip->ip_dst.s_addr)
		return (false);
	if (inp->inp_faddr.s_addr != ctx->ip->ip_src.s_addr)
		return (false);
	return (true);
}

/*
 * Wildcard-tolerant predicate: like rip_inp_match1(), but a PCB field that
 * is unset (protocol 0, null local address, null foreign address) matches
 * anything.
 */
static bool
rip_inp_match2(const struct inpcb *inp, void *v)
{
	struct rip_inp_match_ctx *ctx = v;

	if (inp->inp_ip_p && inp->inp_ip_p != ctx->proto)
		return (false);
#ifdef INET6
	/* XXX inp locking */
	if ((inp->inp_vflag & INP_IPV4) == 0)
		return (false);
#endif
	if (!in_nullhost(inp->inp_laddr) &&
	    !in_hosteq(inp->inp_laddr, ctx->ip->ip_dst))
		return (false);
	if (!in_nullhost(inp->inp_faddr) &&
	    !in_hosteq(inp->inp_faddr, ctx->ip->ip_src))
		return (false);
	return (true);
}

/*
 * Setup generic address and protocol structures for raw_input routine, then
 * pass them along with mbuf chain.
 *
 * Delivery is done in two passes over the raw PCB hash:
 *  1. the bucket computed from (proto, src, dst), filtered with
 *     rip_inp_match1() -- fully specified sockets;
 *  2. bucket 0, filtered with rip_inp_match2() -- sockets with at least one
 *     wildcard field, which additionally get multicast membership
 *     filtering.
 * Each matching socket receives its own copy via rip_append().
 *
 * *mp is set to NULL on entry and the mbuf is disposed of here: it is
 * passed to icmp_error() when nothing was delivered and this function is
 * the registered handler for the protocol, and freed otherwise.  Always
 * returns IPPROTO_DONE.
 */
int
rip_input(struct mbuf **mp, int *offp, int proto)
{
	struct rip_inp_match_ctx ctx = {
		.ip = mtod(*mp, struct ip *),
		.proto = proto,
	};
	struct inpcb_iterator inpi = INP_ITERATOR(&V_ripcbinfo,
	    INPLOOKUP_RLOCKPCB, rip_inp_match1, &ctx);
	struct ifnet *ifp;
	struct mbuf *m = *mp;
	struct inpcb *inp;
	struct sockaddr_in ripsrc;
	int appended, fib;

	M_ASSERTPKTHDR(m);

	*mp = NULL;
	appended = 0;

	/* Source address reported to every receiver. */
	bzero(&ripsrc, sizeof(ripsrc));
	ripsrc.sin_len = sizeof(ripsrc);
	ripsrc.sin_family = AF_INET;
	ripsrc.sin_addr = ctx.ip->ip_src;

	fib = M_GETFIB(m);
	ifp = m->m_pkthdr.rcvif;

	/*
	 * Pass 1: exact matches.  The address arguments are given in the
	 * opposite order from rip_inshash(), which is harmless because the
	 * hash is a sum.
	 */
	inpi.hash = INP_PCBHASH_RAW(proto, ctx.ip->ip_src.s_addr,
	    ctx.ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask);
	while ((inp = inp_next(&inpi)) != NULL) {
		INP_RLOCK_ASSERT(inp);
		if (jailed_without_vnet(inp->inp_cred) &&
		    prison_check_ip4(inp->inp_cred, &ctx.ip->ip_dst) != 0) {
			/*
			 * XXX: If faddr was bound to multicast group,
			 * jailed raw socket will drop datagram.
			 */
			continue;
		}
		if (V_rip_bind_all_fibs == 0 && fib != inp->inp_inc.inc_fibnum)
			/*
			 * Sockets bound to a specific FIB can only receive
			 * packets from that FIB.
			 */
			continue;
		appended += rip_append(inp, ctx.ip, m, &ripsrc);
	}

	/*
	 * Pass 2: reuse the iterator on bucket 0 with the wildcard-tolerant
	 * predicate.  The previous loop must have run to completion.
	 */
	inpi.hash = 0;
	inpi.match = rip_inp_match2;
	MPASS(inpi.inp == NULL);
	while ((inp = inp_next(&inpi)) != NULL) {
		INP_RLOCK_ASSERT(inp);
		if (jailed_without_vnet(inp->inp_cred) &&
		    !IN_MULTICAST(ntohl(ctx.ip->ip_dst.s_addr)) &&
		    prison_check_ip4(inp->inp_cred, &ctx.ip->ip_dst) != 0)
			/*
			 * Allow raw socket in jail to receive multicast;
			 * assume process had PRIV_NETINET_RAW at attach,
			 * and fall through into normal filter path if so.
			 */
			continue;
		if (V_rip_bind_all_fibs == 0 && fib != inp->inp_inc.inc_fibnum)
			continue;

		/*
		 * If this raw socket has multicast state, and we
		 * have received a multicast, check if this socket
		 * should receive it, as multicast filtering is now
		 * the responsibility of the transport layer.
		 */
		if (inp->inp_moptions != NULL &&
		    IN_MULTICAST(ntohl(ctx.ip->ip_dst.s_addr))) {
			/*
			 * If the incoming datagram is for IGMP, allow it
			 * through unconditionally to the raw socket.
			 *
			 * In the case of IGMPv2, we may not have explicitly
			 * joined the group, and may have set IFF_ALLMULTI
			 * on the interface. imo_multi_filter() may discard
			 * control traffic we actually need to see.
			 *
			 * Userland multicast routing daemons should continue
			 * to filter the control traffic appropriately.
			 */
			int blocked;

			blocked = MCAST_PASS;
			if (proto != IPPROTO_IGMP) {
				struct sockaddr_in group;

				bzero(&group, sizeof(struct sockaddr_in));
				group.sin_len = sizeof(struct sockaddr_in);
				group.sin_family = AF_INET;
				group.sin_addr = ctx.ip->ip_dst;

				blocked = imo_multi_filter(inp->inp_moptions,
				    ifp,
				    (struct sockaddr *)&group,
				    (struct sockaddr *)&ripsrc);
			}

			if (blocked != MCAST_PASS) {
				IPSTAT_INC(ips_notmember);
				continue;
			}
		}
		appended += rip_append(inp, ctx.ip, m, &ripsrc);
	}
	/*
	 * Nobody took the packet.  Only complain with an ICMP protocol
	 * unreachable when rip_input itself is the handler registered for
	 * this protocol; in every other case just release the original.
	 * (icmp_error() is presumed to consume m -- it is not freed here.)
	 */
	if (appended == 0 && ip_protox[ctx.ip->ip_p] == rip_input) {
		IPSTAT_INC(ips_noproto);
		IPSTAT_DEC(ips_delivered);
		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL, 0, 0);
	} else
		m_freem(m);
	return (IPPROTO_DONE);
}

/*
 * Generate IP header and pass packet to ip_output.  Tack on options user may
 * have setup with control call.
 *
 * The destination is the connected peer for a connected socket (passing
 * 'nam' then yields EISCONN) or the AF_INET address in 'nam' otherwise
 * (ENOTCONN, EAFNOSUPPORT or EINVAL if it is missing or malformed).
 * Any 'control' mbuf is freed and ignored.
 *
 * Without INP_HDRINCL an IP header is prepended and filled in from the
 * PCB.  With INP_HDRINCL the caller-supplied header is validated (length
 * fields, jail source address, IP option structure) and sent as is, with
 * IP_RAWOUTPUT set.
 *
 * On every error path below the mbuf is either freed with m_freem() or was
 * already left NULL by M_PREPEND()/m_pullup().  The inp read lock is taken
 * inside each branch once the mbuf is laid out, and is dropped either on
 * an error return or after ip_output() returns.
 *
 * Returns 0 or an errno value.
 */
static int
rip_send(struct socket *so, int pruflags, struct mbuf *m, struct sockaddr *nam,
    struct mbuf *control, struct thread *td)
{
	struct epoch_tracker et;
	struct ip *ip;
	struct inpcb *inp;
	in_addr_t *dst;
	int error, flags, cnt, hlen;
	u_char opttype, optlen, *cp;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("rip_send: inp == NULL"));

	if (control != NULL) {
		m_freem(control);
		control = NULL;
	}

	if (so->so_state & SS_ISCONNECTED) {
		if (nam) {
			error = EISCONN;
			m_freem(m);
			return (error);
		}
		dst = &inp->inp_faddr.s_addr;
	} else {
		if (nam == NULL)
			error = ENOTCONN;
		else if (nam->sa_family != AF_INET)
			error = EAFNOSUPPORT;
		else if (nam->sa_len != sizeof(struct sockaddr_in))
			error = EINVAL;
		else
			error = 0;
		if (error != 0) {
			m_freem(m);
			return (error);
		}
		dst = &((struct sockaddr_in *)nam)->sin_addr.s_addr;
	}

	flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) |
	    IP_ALLOWBROADCAST;

	/*
	 * If the user handed us a complete IP packet, use it.  Otherwise,
	 * allocate an mbuf for a header and fill it in.
	 */
	if ((inp->inp_flags & INP_HDRINCL) == 0) {
		if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) {
			m_freem(m);
			return(EMSGSIZE);
		}
		M_PREPEND(m, sizeof(struct ip), M_NOWAIT);
		if (m == NULL)
			return(ENOBUFS);

		INP_RLOCK(inp);
		ip = mtod(m, struct ip *);
		ip->ip_tos = inp->inp_ip_tos;
		if (inp->inp_flags & INP_DONTFRAG)
			ip->ip_off = htons(IP_DF);
		else
			ip->ip_off = htons(0);
		ip->ip_p = inp->inp_ip_p;
		ip->ip_len = htons(m->m_pkthdr.len);
		ip->ip_src = inp->inp_laddr;
		ip->ip_dst.s_addr = *dst;
#ifdef ROUTE_MPATH
		if (CALC_FLOWID_OUTBOUND) {
			uint32_t hash_type, hash_val;

			hash_val = fib4_calc_software_hash(ip->ip_src,
			    ip->ip_dst, 0, 0, ip->ip_p, &hash_type);
			m->m_pkthdr.flowid = hash_val;
			M_HASHTYPE_SET(m, hash_type);
			flags |= IP_NODEFAULTFLOWID;
		}
#endif
		if (jailed(inp->inp_cred)) {
			/*
			 * prison_local_ip4() would be good enough but would
			 * let a source of INADDR_ANY pass, which we do not
			 * want to see from jails.
			 */
			if (ip->ip_src.s_addr == INADDR_ANY) {
				NET_EPOCH_ENTER(et);
				error = in_pcbladdr(inp, &ip->ip_dst,
				    &ip->ip_src, inp->inp_cred);
				NET_EPOCH_EXIT(et);
			} else {
				error = prison_local_ip4(inp->inp_cred,
				    &ip->ip_src);
			}
			if (error != 0) {
				INP_RUNLOCK(inp);
				m_freem(m);
				return (error);
			}
		}
		ip->ip_ttl = inp->inp_ip_ttl;
	} else {
		if (m->m_pkthdr.len > IP_MAXPACKET) {
			m_freem(m);
			return (EMSGSIZE);
		}
		if (m->m_pkthdr.len < sizeof(*ip)) {
			m_freem(m);
			return (EINVAL);
		}
		/* Make the fixed header contiguous before reading ip_hl. */
		m = m_pullup(m, sizeof(*ip));
		if (m == NULL)
			return (ENOMEM);
		ip = mtod(m, struct ip *);
		hlen = ip->ip_hl << 2;
		/* Then make the whole header, options included, contiguous. */
		if (m->m_len < hlen) {
			m = m_pullup(m, hlen);
			if (m == NULL)
				return (EINVAL);
			ip = mtod(m, struct ip *);
		}
#ifdef ROUTE_MPATH
		if (CALC_FLOWID_OUTBOUND) {
			uint32_t hash_type, hash_val;

			/*
			 * NOTE(review): the address operands are passed as
			 * (dst, src) here but as (src, dst) in the non-HDRINCL
			 * branch above -- confirm whether this is intended.
			 */
			hash_val = fib4_calc_software_hash(ip->ip_dst,
			    ip->ip_src, 0, 0, ip->ip_p, &hash_type);
			m->m_pkthdr.flowid = hash_val;
			M_HASHTYPE_SET(m, hash_type);
			flags |= IP_NODEFAULTFLOWID;
		}
#endif
		INP_RLOCK(inp);
		/*
		 * Don't allow both user specified and setsockopt options,
		 * and don't allow packet length sizes that will crash.
		 */
		if ((hlen < sizeof (*ip))
		    || ((hlen > sizeof (*ip)) && inp->inp_options)
		    || (ntohs(ip->ip_len) != m->m_pkthdr.len)) {
			INP_RUNLOCK(inp);
			m_freem(m);
			return (EINVAL);
		}
		error = prison_check_ip4(inp->inp_cred, &ip->ip_src);
		if (error != 0) {
			INP_RUNLOCK(inp);
			m_freem(m);
			return (error);
		}
		/*
		 * Don't allow IP options which do not have the required
		 * structure as specified in section 3.1 of RFC 791 on
		 * pages 15-23.
		 *
		 * Walk the option area: EOL ends the walk, NOP is a single
		 * byte, every other option must have room for its length
		 * byte and a length that is at least two and no larger than
		 * what remains.
		 */
		cp = (u_char *)(ip + 1);
		cnt = hlen - sizeof (struct ip);
		for (; cnt > 0; cnt -= optlen, cp += optlen) {
			opttype = cp[IPOPT_OPTVAL];
			if (opttype == IPOPT_EOL)
				break;
			if (opttype == IPOPT_NOP) {
				optlen = 1;
				continue;
			}
			if (cnt < IPOPT_OLEN + sizeof(u_char)) {
				INP_RUNLOCK(inp);
				m_freem(m);
				return (EINVAL);
			}
			optlen = cp[IPOPT_OLEN];
			if (optlen < IPOPT_OLEN + sizeof(u_char) ||
			    optlen > cnt) {
				INP_RUNLOCK(inp);
				m_freem(m);
				return (EINVAL);
			}
		}
		/*
		 * This doesn't allow application to specify ID of zero,
		 * but we got this limitation from the beginning of history.
		 */
		if (ip->ip_id == 0)
			ip_fillid(ip, V_ip_random_id);

		/*
		 * XXX prevent ip_output from overwriting header fields.
		 */
		flags |= IP_RAWOUTPUT;
		IPSTAT_INC(ips_rawout);
	}

	if (inp->inp_flags & INP_ONESBCAST)
		flags |= IP_SENDONES;

#ifdef MAC
	mac_inpcb_create_mbuf(inp, m);
#endif

	/* Both branches above leave the inp read-locked at this point. */
	NET_EPOCH_ENTER(et);
	error = ip_output(m, inp->inp_options, NULL, flags,
	    inp->inp_moptions, inp);
	NET_EPOCH_EXIT(et);
	INP_RUNLOCK(inp);
	return (error);
}

/*
 * Raw IP socket option processing.
 *
 * IMPORTANT NOTE regarding access control: Traditionally, raw sockets could
 * only be created by a privileged process, and as such, socket option
 * operations to manage system properties on any raw socket were allowed to
 * take place without explicit additional access control checks.  However,
 * raw sockets can now also be created in jail(), and therefore explicit
 * checks are now required.  Likewise, raw sockets can be used by a process
 * after it gives up privilege, so some caution is required.  For options
 * passed down to the IP layer via ip_ctloutput(), checks are assumed to be
 * performed in ip_ctloutput() and therefore no check occurs here.
 * Unilaterally checking priv_check() here breaks normal IP socket option
 * operations on raw sockets.
 *
 * When adding new socket options here, make sure to add access control
 * checks here as necessary.
 *
 * Only IPPROTO_IP level options are handled, with the single exception of
 * setting SOL_SOCKET/SO_SETFIB, which is forwarded to ip_ctloutput(); any
 * other level yields EINVAL.  Option names not listed below fall through
 * to ip_ctloutput().  Returns 0 or an errno value.
 */
int
rip_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct inpcb *inp = sotoinpcb(so);
	int	error, optval;

	if (sopt->sopt_level != IPPROTO_IP) {
		if (sopt->sopt_dir == SOPT_SET &&
		    sopt->sopt_level == SOL_SOCKET &&
		    sopt->sopt_name == SO_SETFIB)
			return (ip_ctloutput(so, sopt));
		return (EINVAL);
	}

	error = 0;
	switch (sopt->sopt_dir) {
	case SOPT_GET:
		switch (sopt->sopt_name) {
		case IP_HDRINCL:
			/* Reports the raw flag bit, not a normalized 0/1. */
			optval = inp->inp_flags & INP_HDRINCL;
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;

		case IP_FW3:	/* generic ipfw v.3 functions */
		case IP_FW_ADD:	/* ADD actually returns the body... */
		case IP_FW_GET:
		case IP_FW_TABLE_GETSIZE:
		case IP_FW_TABLE_LIST:
		case IP_FW_NAT_GET_CONFIG:
		case IP_FW_NAT_GET_LOG:
			/* Hook is NULL unless a firewall has registered. */
			if (V_ip_fw_ctl_ptr != NULL)
				error = V_ip_fw_ctl_ptr(sopt);
			else
				error = ENOPROTOOPT;
			break;

		case IP_DUMMYNET3:	/* generic dummynet v.3 functions */
		case IP_DUMMYNET_GET:
			if (ip_dn_ctl_ptr != NULL)
				error = ip_dn_ctl_ptr(sopt);
			else
				error = ENOPROTOOPT;
			break ;

		case MRT_INIT:
		case MRT_DONE:
		case MRT_ADD_VIF:
		case MRT_DEL_VIF:
		case MRT_ADD_MFC:
		case MRT_DEL_MFC:
		case MRT_VERSION:
		case MRT_ASSERT:
		case MRT_API_SUPPORT:
		case MRT_API_CONFIG:
		case MRT_ADD_BW_UPCALL:
		case MRT_DEL_BW_UPCALL:
			/*
			 * Multicast routing options need the mroute
			 * privilege and are only valid on an IGMP socket.
			 */
			error = priv_check(curthread, PRIV_NETINET_MROUTE);
			if (error != 0)
				return (error);
			if (inp->inp_ip_p != IPPROTO_IGMP)
				return (EOPNOTSUPP);
			error = ip_mrouter_get ? ip_mrouter_get(so, sopt) :
				EOPNOTSUPP;
			break;

		default:
			error = ip_ctloutput(so, sopt);
			break;
		}
		break;

	case SOPT_SET:
		switch (sopt->sopt_name) {
		case IP_HDRINCL:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				break;
			INP_WLOCK(inp);
			if (optval)
				inp->inp_flags |= INP_HDRINCL;
			else
				inp->inp_flags &= ~INP_HDRINCL;
			INP_WUNLOCK(inp);
			break;

		case IP_FW3:	/* generic ipfw v.3 functions */
		case IP_FW_ADD:
		case IP_FW_DEL:
		case IP_FW_FLUSH:
		case IP_FW_ZERO:
		case IP_FW_RESETLOG:
		case IP_FW_TABLE_ADD:
		case IP_FW_TABLE_DEL:
		case IP_FW_TABLE_FLUSH:
		case IP_FW_NAT_CFG:
		case IP_FW_NAT_DEL:
			if (V_ip_fw_ctl_ptr != NULL)
				error = V_ip_fw_ctl_ptr(sopt);
			else
				error = ENOPROTOOPT;
			break;

		case IP_DUMMYNET3:	/* generic dummynet v.3 functions */
		case IP_DUMMYNET_CONFIGURE:
		case IP_DUMMYNET_DEL:
		case IP_DUMMYNET_FLUSH:
			if (ip_dn_ctl_ptr != NULL)
				error = ip_dn_ctl_ptr(sopt);
			else
				error = ENOPROTOOPT ;
			break ;

		case IP_RSVP_ON:
			error = priv_check(curthread, PRIV_NETINET_MROUTE);
			if (error != 0)
				return (error);
			if (inp->inp_ip_p != IPPROTO_RSVP)
				return (EOPNOTSUPP);
			error = ip_rsvp_init(so);
			break;

		case IP_RSVP_OFF:
			/* Unlike the other RSVP options, no protocol check. */
			error = priv_check(curthread, PRIV_NETINET_MROUTE);
			if (error != 0)
				return (error);
			error = ip_rsvp_done();
			break;

		case IP_RSVP_VIF_ON:
		case IP_RSVP_VIF_OFF:
			error = priv_check(curthread, PRIV_NETINET_MROUTE);
			if (error != 0)
				return (error);
			if (inp->inp_ip_p != IPPROTO_RSVP)
				return (EOPNOTSUPP);
			error = ip_rsvp_vif ?
				ip_rsvp_vif(so, sopt) : EINVAL;
			break;

		case MRT_INIT:
		case MRT_DONE:
		case MRT_ADD_VIF:
		case MRT_DEL_VIF:
		case MRT_ADD_MFC:
		case MRT_DEL_MFC:
		case MRT_VERSION:
		case MRT_ASSERT:
		case MRT_API_SUPPORT:
		case MRT_API_CONFIG:
		case MRT_ADD_BW_UPCALL:
		case MRT_DEL_BW_UPCALL:
			error = priv_check(curthread, PRIV_NETINET_MROUTE);
			if (error != 0)
				return (error);
			if (inp->inp_ip_p != IPPROTO_IGMP)
				return (EOPNOTSUPP);
			error = ip_mrouter_set ? ip_mrouter_set(so, sopt) :
					EOPNOTSUPP;
			break;

		default:
			error = ip_ctloutput(so, sopt);
			break;
		}
		break;
	}

	return (error);
}

/*
 * ICMP control input for raw IP.  The only action taken is to pass the
 * ICMP message to IPsec when IPsec support is compiled in and enabled;
 * otherwise this is a no-op.
 */
void
rip_ctlinput(struct icmp *icmp)
{
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	if (IPSEC_ENABLED(ipv4))
		IPSEC_CTLINPUT(ipv4, icmp);
#endif
}

/*
 * Attach a new raw socket: require PRIV_NETINET_RAW, validate the protocol
 * number (EPROTONOSUPPORT outside [0, IPPROTO_MAX)), reserve socket buffer
 * space, allocate the PCB, record the protocol and default TTL, and insert
 * the PCB into the raw hash.
 *
 * Returns 0 or an errno value.  The INP_WUNLOCK at the end, with no
 * matching INP_WLOCK here, implies that in_pcballoc() returns with the new
 * inp write-locked.
 */
static int
rip_attach(struct socket *so, int proto, struct thread *td)
{
	struct inpcb *inp;
	int error;

	inp = sotoinpcb(so);
	KASSERT(inp == NULL, ("rip_attach: inp != NULL"));

	error = priv_check(td, PRIV_NETINET_RAW);
	if (error)
		return (error);
	if (proto >= IPPROTO_MAX || proto < 0)
		return EPROTONOSUPPORT;
	error = soreserve(so, rip_sendspace, rip_recvspace);
	if (error)
		return (error);
	error = in_pcballoc(so, &V_ripcbinfo);
	if (error)
		return (error);
	inp = (struct inpcb *)so->so_pcb;
	inp->inp_ip_p = proto;
	inp->inp_ip_ttl = V_ip_defttl;
	INP_HASH_WLOCK(&V_ripcbinfo);
	rip_inshash(inp);
	INP_HASH_WUNLOCK(&V_ripcbinfo);
	INP_WUNLOCK(inp);
	return (0);
}

/*
 * Detach a raw socket.  The socket must already be disconnected (foreign
 * address cleared; asserted).  Shuts down multicast routing if this is the
 * mrouter socket, removes the PCB from the hash, runs the RSVP teardown
 * hooks and finally frees the PCB.
 *
 * The inp is write-locked here and not unlocked; in_pcbfree() is presumed
 * to consume the lock.
 */
static void
rip_detach(struct socket *so)
{
	struct inpcb *inp;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("rip_detach: inp == NULL"));
	KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
	    ("rip_detach: not closed"));

	/* Disable mrouter first */
	if (so == V_ip_mrouter && ip_mrouter_done)
		ip_mrouter_done();

	INP_WLOCK(inp);
	INP_HASH_WLOCK(&V_ripcbinfo);
	rip_delhash(inp);
	INP_HASH_WUNLOCK(&V_ripcbinfo);

	if (ip_rsvp_force_done)
		ip_rsvp_force_done(so);
	if (so == V_ip_rsvpd)
		ip_rsvp_done();
	in_pcbfree(inp);
}

/*
 * Common disconnect path: clear the foreign address, move the PCB to the
 * hash chain matching its new (wildcard) state, and clear SS_ISCONNECTED
 * on the socket.  Takes and releases the inp write lock, the pcbinfo hash
 * write lock and the socket lock itself.
 */
static void
rip_dodisconnect(struct socket *so, struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo;

	pcbinfo = inp->inp_pcbinfo;
	INP_WLOCK(inp);
	INP_HASH_WLOCK(pcbinfo);
	rip_delhash(inp);
	inp->inp_faddr.s_addr = INADDR_ANY;
	rip_inshash(inp);
	INP_HASH_WUNLOCK(pcbinfo);
	SOCK_LOCK(so);
	so->so_state &= ~SS_ISCONNECTED;
	SOCK_UNLOCK(so);
	INP_WUNLOCK(inp);
}

/*
 * pr_abort handler: simply disconnects the socket.
 */
static void
rip_abort(struct socket *so)
{
	struct inpcb *inp;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("rip_abort: inp == NULL"));

	rip_dodisconnect(so, inp);
}

/*
 * pr_close handler: simply disconnects the socket.
 */
static void
rip_close(struct socket *so)
{
	struct inpcb *inp;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("rip_close: inp == NULL"));

	rip_dodisconnect(so, inp);
}

/*
 * pr_disconnect handler: returns ENOTCONN if the socket is not connected,
 * otherwise disconnects it and returns 0.
 */
static int
rip_disconnect(struct socket *so)
{
	struct inpcb *inp;

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("rip_disconnect: inp == NULL"));

	rip_dodisconnect(so, inp);
	return (0);
}

/*
 * Bind a raw socket to a local IPv4 address.
 *
 * Fails with EAFNOSUPPORT for a non-AF_INET sockaddr, EINVAL for a wrong
 * sockaddr length, the error from prison_check_ip4() if the caller's jail
 * may not use the address, and EADDRNOTAVAIL if there are no interfaces or
 * the (non-zero) address is not configured locally and INP_BINDANY is not
 * set.  On success the local address is stored and the PCB rehashed.
 */
static int
rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	struct sockaddr_in *addr = (struct sockaddr_in *)nam;
	struct inpcb *inp;
	int error;

	if (nam->sa_family != AF_INET)
		return (EAFNOSUPPORT);
	if (nam->sa_len != sizeof(*addr))
		return (EINVAL);

	error = prison_check_ip4(td->td_ucred, &addr->sin_addr);
	if (error != 0)
		return (error);

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("rip_bind: inp == NULL"));

	/*
	 * NOTE(review): sa_family was already required to be AF_INET above;
	 * assuming sin_family aliases sa_family, the sin_family/AF_IMPLINK
	 * test below can no longer fail -- confirm.
	 */
	if (CK_STAILQ_EMPTY(&V_ifnet) ||
	    (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) ||
	    (addr->sin_addr.s_addr &&
	     (inp->inp_flags & INP_BINDANY) == 0 &&
	     ifa_ifwithaddr_check((struct sockaddr *)addr) == 0))
		return (EADDRNOTAVAIL);

	INP_WLOCK(inp);
	INP_HASH_WLOCK(&V_ripcbinfo);
	rip_delhash(inp);
	inp->inp_laddr = addr->sin_addr;
	rip_inshash(inp);
	INP_HASH_WUNLOCK(&V_ripcbinfo);
	INP_WUNLOCK(inp);
	return (0);
}

/*
 * Connect a raw socket to a foreign IPv4 address.
 *
 * Fails with EINVAL for a wrong sockaddr length, EADDRNOTAVAIL if there
 * are no interfaces, and EAFNOSUPPORT if the family is neither AF_INET nor
 * AF_IMPLINK.  On success the foreign address is stored, the PCB rehashed
 * and the socket marked connected.
 */
static int
rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	struct sockaddr_in *addr = (struct sockaddr_in *)nam;
	struct inpcb *inp;

	if (nam->sa_len != sizeof(*addr))
		return (EINVAL);
	if (CK_STAILQ_EMPTY(&V_ifnet))
		return (EADDRNOTAVAIL);
	if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK)
		return (EAFNOSUPPORT);

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("rip_connect: inp == NULL"));

	INP_WLOCK(inp);
	INP_HASH_WLOCK(&V_ripcbinfo);
	rip_delhash(inp);
	inp->inp_faddr = addr->sin_addr;
	rip_inshash(inp);
	INP_HASH_WUNLOCK(&V_ripcbinfo);
	soisconnected(so);
	INP_WUNLOCK(inp);
	return (0);
}

/*
 * pr_shutdown handler.  Returns ENOTCONN if the socket is not connected.
 * Otherwise SHUT_RD flushes the receive side, SHUT_WR marks the socket as
 * unable to send more, and SHUT_RDWR does both.  Returns 0 on success.
 */
static int
rip_shutdown(struct socket *so, enum shutdown_how how)
{

	SOCK_LOCK(so);
	if (!(so->so_state & SS_ISCONNECTED)) {
		SOCK_UNLOCK(so);
		return (ENOTCONN);
	}
	SOCK_UNLOCK(so);

	switch (how) {
	case SHUT_RD:
		sorflush(so);
		break;
	case SHUT_RDWR:
		sorflush(so);
		/* FALLTHROUGH */
	case SHUT_WR:
		socantsendmore(so);
	}

	return (0);
}
#endif /* INET */

/*
 * Sysctl handler exporting the list of raw IP PCBs.
 *
 * The node is read-only: a write attempt yields EPERM.  A size probe (no
 * output buffer) reports room for two xinpgen records plus the current PCB
 * count padded by max(count / 8, 10) entries.  Otherwise the output is a
 * leading xinpgen, one xinpcb per PCB that is not newer than the initial
 * generation and that the caller is allowed to see, and a trailing xinpgen
 * carrying the counters as they are at the end of the walk.
 */
static int
rip_pcblist(SYSCTL_HANDLER_ARGS)
{
	struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_ripcbinfo,
	    INPLOOKUP_RLOCKPCB);
	struct xinpgen xig;
	struct inpcb *inp;
	int error;

	if (req->newptr != 0)
		return (EPERM);

	if (req->oldptr == 0) {
		int n;

		n = V_ripcbinfo.ipi_count;
		n += imax(n / 8, 10);
		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb);
		return (0);
	}

	if ((error = sysctl_wire_old_buffer(req, 0)) != 0)
		return (error);

	bzero(&xig, sizeof(xig));
	xig.xig_len = sizeof xig;
	xig.xig_count = V_ripcbinfo.ipi_count;
	xig.xig_gen = V_ripcbinfo.ipi_gencnt;
	xig.xig_sogen = so_gencnt;
	error = SYSCTL_OUT(req, &xig, sizeof xig);
	if (error)
		return (error);

	while ((inp = inp_next(&inpi)) != NULL) {
		if (inp->inp_gencnt <= xig.xig_gen &&
		    cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
			struct xinpcb xi;

			in_pcbtoxinpcb(inp, &xi);
			error = SYSCTL_OUT(req, &xi, sizeof xi);
			if (error) {
				/*
				 * Leaving the loop early: drop the read
				 * lock on the current inp by hand, since
				 * inp_next() will not be called again.
				 */
				INP_RUNLOCK(inp);
				break;
			}
		}
	}

	if (!error) {
		/*
		 * Give the user an updated idea of our state.  If the
		 * generation differs from what we told her before, she knows
		 * that something happened while we were processing this
		 * request, and it might be necessary to retry.
		 */
		xig.xig_gen = V_ripcbinfo.ipi_gencnt;
		xig.xig_sogen = so_gencnt;
		xig.xig_count = V_ripcbinfo.ipi_count;
		error = SYSCTL_OUT(req, &xig, sizeof xig);
	}

	return (error);
}

SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist,
    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    rip_pcblist, "S,xinpcb",
    "List of active raw IP sockets");

#ifdef INET
/*
 * Protocol switch entry for SOCK_RAW IPv4 sockets, wiring the handlers
 * defined in this file together with the generic in_* helpers.
 */
struct protosw rip_protosw = {
	.pr_type =		SOCK_RAW,
	.pr_flags =		PR_ATOMIC|PR_ADDR,
	.pr_ctloutput =		rip_ctloutput,
	.pr_abort =		rip_abort,
	.pr_attach =		rip_attach,
	.pr_bind =		rip_bind,
	.pr_connect =		rip_connect,
	.pr_control =		in_control,
	.pr_detach =		rip_detach,
	.pr_disconnect =	rip_disconnect,
	.pr_peeraddr =		in_getpeeraddr,
	.pr_send =		rip_send,
	.pr_shutdown =		rip_shutdown,
	.pr_sockaddr =		in_getsockaddr,
	.pr_sosetlabel =	in_pcbsosetlabel,
	.pr_close =		rip_close
};
#endif /* INET */