/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1993
 *	The Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_route.h"

#include <sys/param.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/rwlock.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stdarg.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/systm.h>

#include <vm/uma.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>
#include <net/route/route_ctl.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_fib.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/if_ether.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_mroute.h>
#include <netinet/ip_icmp.h>

#include <netipsec/ipsec_support.h>

#include <security/mac/mac_framework.h>

/*
 * Protocol input dispatch table, defined elsewhere.  rip_input() consults it
 * to find out whether raw IP is the registered handler for a protocol number.
 */
extern ipproto_input_t *ip_protox[];

/*
 * Per-VNET TTL value, initialised to IPDEFTTL and exported read/write through
 * the "ttl" sysctl node.  rip_attach() copies it into every new raw PCB.
 */
VNET_DEFINE(int, ip_defttl) = IPDEFTTL;
SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(ip_defttl), 0,
    "Maximum TTL on IP packets");

/*
 * Per-VNET PCB database for raw IP sockets; set up by rip_init().
 */
VNET_DEFINE(struct inpcbinfo, ripcbinfo);
#define	V_ripcbinfo		VNET(ripcbinfo)

/*
 * Control and data hooks for ipfw, dummynet, divert and so on.
 * The data hooks are not used here but it is convenient
 * to keep them all in one place.
 */
VNET_DEFINE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr) = NULL;

int	(*ip_dn_ctl_ptr)(struct sockopt *);
int	(*ip_dn_io_ptr)(struct mbuf **, struct ip_fw_args *);
void	(*ip_divert_ptr)(struct mbuf *, bool);
int	(*ng_ipfw_input_p)(struct mbuf **, struct ip_fw_args *, bool);

#ifdef INET
/*
 * Hooks for multicast routing.
They all default to NULL, so leave them not 105 * initialized and rely on BSS being set to 0. 106 */ 107 108 /* 109 * The socket used to communicate with the multicast routing daemon. 110 */ 111 VNET_DEFINE(struct socket *, ip_mrouter); 112 113 /* 114 * The various mrouter and rsvp functions. 115 */ 116 int (*ip_mrouter_set)(struct socket *, struct sockopt *); 117 int (*ip_mrouter_get)(struct socket *, struct sockopt *); 118 int (*ip_mrouter_done)(void); 119 int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, 120 struct ip_moptions *); 121 int (*mrt_ioctl)(u_long, caddr_t, int); 122 int (*legal_vif_num)(int); 123 u_long (*ip_mcast_src)(int); 124 125 int (*rsvp_input_p)(struct mbuf **, int *, int); 126 int (*ip_rsvp_vif)(struct socket *, struct sockopt *); 127 void (*ip_rsvp_force_done)(struct socket *); 128 #endif /* INET */ 129 130 #define V_rip_bind_all_fibs VNET(rip_bind_all_fibs) 131 VNET_DEFINE(int, rip_bind_all_fibs) = 1; 132 SYSCTL_INT(_net_inet_raw, OID_AUTO, bind_all_fibs, CTLFLAG_VNET | CTLFLAG_RDTUN, 133 &VNET_NAME(rip_bind_all_fibs), 0, 134 "Bound sockets receive traffic from all FIBs"); 135 136 u_long rip_sendspace = 9216; 137 SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW, 138 &rip_sendspace, 0, "Maximum outgoing raw IP datagram size"); 139 140 u_long rip_recvspace = 9216; 141 SYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW, 142 &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams"); 143 144 /* 145 * Hash functions 146 */ 147 148 #define INP_PCBHASH_RAW_SIZE 256 149 #define INP_PCBHASH_RAW(proto, laddr, faddr, mask) \ 150 (((proto) + (laddr) + (faddr)) % (mask) + 1) 151 152 #ifdef INET 153 static void 154 rip_inshash(struct inpcb *inp) 155 { 156 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 157 struct inpcbhead *pcbhash; 158 int hash; 159 160 INP_HASH_WLOCK_ASSERT(pcbinfo); 161 INP_WLOCK_ASSERT(inp); 162 163 if (inp->inp_ip_p != 0 && 164 inp->inp_laddr.s_addr != INADDR_ANY && 165 inp->inp_faddr.s_addr != 
INADDR_ANY) { 166 hash = INP_PCBHASH_RAW(inp->inp_ip_p, inp->inp_laddr.s_addr, 167 inp->inp_faddr.s_addr, pcbinfo->ipi_hashmask); 168 } else 169 hash = 0; 170 pcbhash = &pcbinfo->ipi_hash_exact[hash]; 171 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_exact); 172 } 173 174 static void 175 rip_delhash(struct inpcb *inp) 176 { 177 178 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 179 INP_WLOCK_ASSERT(inp); 180 181 CK_LIST_REMOVE(inp, inp_hash_exact); 182 } 183 #endif /* INET */ 184 185 INPCBSTORAGE_DEFINE(ripcbstor, inpcb, "rawinp", "ripcb", "rip", "riphash"); 186 187 static void 188 rip_init(void *arg __unused) 189 { 190 191 in_pcbinfo_init(&V_ripcbinfo, &ripcbstor, INP_PCBHASH_RAW_SIZE, 1); 192 } 193 VNET_SYSINIT(rip_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rip_init, NULL); 194 195 #ifdef VIMAGE 196 static void 197 rip_destroy(void *unused __unused) 198 { 199 200 in_pcbinfo_destroy(&V_ripcbinfo); 201 } 202 VNET_SYSUNINIT(raw_ip, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, rip_destroy, NULL); 203 #endif 204 205 #ifdef INET 206 static int 207 rip_append(struct inpcb *inp, struct ip *ip, struct mbuf *m, 208 struct sockaddr_in *ripsrc) 209 { 210 struct socket *so = inp->inp_socket; 211 struct mbuf *n, *opts = NULL; 212 213 INP_LOCK_ASSERT(inp); 214 215 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 216 /* check AH/ESP integrity. */ 217 if (IPSEC_ENABLED(ipv4) && IPSEC_CHECK_POLICY(ipv4, m, inp) != 0) 218 return (0); 219 #endif /* IPSEC */ 220 #ifdef MAC 221 if (mac_inpcb_check_deliver(inp, m) != 0) 222 return (0); 223 #endif 224 /* Check the minimum TTL for socket. 
*/ 225 if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) 226 return (0); 227 228 if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL) 229 return (0); 230 231 if ((inp->inp_flags & INP_CONTROLOPTS) || 232 (so->so_options & (SO_TIMESTAMP | SO_BINTIME))) 233 ip_savecontrol(inp, &opts, ip, n); 234 SOCKBUF_LOCK(&so->so_rcv); 235 if (sbappendaddr_locked(&so->so_rcv, 236 (struct sockaddr *)ripsrc, n, opts) == 0) { 237 soroverflow_locked(so); 238 m_freem(n); 239 if (opts) 240 m_freem(opts); 241 return (0); 242 } 243 sorwakeup_locked(so); 244 245 return (1); 246 } 247 248 struct rip_inp_match_ctx { 249 struct ip *ip; 250 int proto; 251 }; 252 253 static bool 254 rip_inp_match1(const struct inpcb *inp, void *v) 255 { 256 struct rip_inp_match_ctx *ctx = v; 257 258 if (inp->inp_ip_p != ctx->proto) 259 return (false); 260 #ifdef INET6 261 /* XXX inp locking */ 262 if ((inp->inp_vflag & INP_IPV4) == 0) 263 return (false); 264 #endif 265 if (inp->inp_laddr.s_addr != ctx->ip->ip_dst.s_addr) 266 return (false); 267 if (inp->inp_faddr.s_addr != ctx->ip->ip_src.s_addr) 268 return (false); 269 return (true); 270 } 271 272 static bool 273 rip_inp_match2(const struct inpcb *inp, void *v) 274 { 275 struct rip_inp_match_ctx *ctx = v; 276 277 if (inp->inp_ip_p && inp->inp_ip_p != ctx->proto) 278 return (false); 279 #ifdef INET6 280 /* XXX inp locking */ 281 if ((inp->inp_vflag & INP_IPV4) == 0) 282 return (false); 283 #endif 284 if (!in_nullhost(inp->inp_laddr) && 285 !in_hosteq(inp->inp_laddr, ctx->ip->ip_dst)) 286 return (false); 287 if (!in_nullhost(inp->inp_faddr) && 288 !in_hosteq(inp->inp_faddr, ctx->ip->ip_src)) 289 return (false); 290 return (true); 291 } 292 293 /* 294 * Setup generic address and protocol structures for raw_input routine, then 295 * pass them along with mbuf chain. 
 */
/*
 * rip_input() takes over the packet in *mp (and clears *mp) and offers it to
 * raw sockets in two passes over the raw PCB database:
 *
 *  1. the hash bucket computed from (proto, src, dst), filtered by
 *     rip_inp_match1(), i.e. fully specified sockets;
 *  2. bucket 0, filtered by rip_inp_match2(), i.e. sockets with a wildcard
 *     protocol or address.
 *
 * Every accepted socket receives its own copy through rip_append().  When
 * no socket took a copy and raw IP is the registered handler for the
 * protocol, an ICMP protocol-unreachable error is generated from the
 * packet; otherwise the packet is freed.  offp is not used.  Always returns
 * IPPROTO_DONE.
 */
int
rip_input(struct mbuf **mp, int *offp, int proto)
{
	struct rip_inp_match_ctx ctx = {
		.ip = mtod(*mp, struct ip *),
		.proto = proto,
	};
	struct inpcb_iterator inpi = INP_ITERATOR(&V_ripcbinfo,
	    INPLOOKUP_RLOCKPCB, rip_inp_match1, &ctx);
	struct ifnet *ifp;
	struct mbuf *m = *mp;
	struct inpcb *inp;
	struct sockaddr_in ripsrc;
	int appended, fib;

	M_ASSERTPKTHDR(m);

	*mp = NULL;
	appended = 0;

	/* Source address reported to every receiving socket. */
	bzero(&ripsrc, sizeof(ripsrc));
	ripsrc.sin_len = sizeof(ripsrc);
	ripsrc.sin_family = AF_INET;
	ripsrc.sin_addr = ctx.ip->ip_src;

	fib = M_GETFIB(m);
	ifp = m->m_pkthdr.rcvif;

	/*
	 * Pass 1: exact matches.  rip_inshash() hashes (proto, laddr, faddr)
	 * while the packet supplies (proto, src, dst); INP_PCBHASH_RAW() only
	 * sums its address arguments, so the swapped order selects the same
	 * bucket.
	 */
	inpi.hash = INP_PCBHASH_RAW(proto, ctx.ip->ip_src.s_addr,
	    ctx.ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask);
	while ((inp = inp_next(&inpi)) != NULL) {
		INP_RLOCK_ASSERT(inp);
		if (jailed_without_vnet(inp->inp_cred) &&
		    prison_check_ip4(inp->inp_cred, &ctx.ip->ip_dst) != 0) {
			/*
			 * XXX: If faddr was bound to multicast group,
			 * jailed raw socket will drop datagram.
			 */
			continue;
		}
		if (V_rip_bind_all_fibs == 0 && fib != inp->inp_inc.inc_fibnum)
			/*
			 * Sockets bound to a specific FIB can only receive
			 * packets from that FIB.
			 */
			continue;
		appended += rip_append(inp, ctx.ip, m, &ripsrc);
	}

	/*
	 * Pass 2: wildcard sockets, all of which live in bucket 0.  The
	 * iterator is reused with a different bucket and match callback; it
	 * must have run to completion (inpi.inp == NULL) before that.
	 */
	inpi.hash = 0;
	inpi.match = rip_inp_match2;
	MPASS(inpi.inp == NULL);
	while ((inp = inp_next(&inpi)) != NULL) {
		INP_RLOCK_ASSERT(inp);
		if (jailed_without_vnet(inp->inp_cred) &&
		    !IN_MULTICAST(ntohl(ctx.ip->ip_dst.s_addr)) &&
		    prison_check_ip4(inp->inp_cred, &ctx.ip->ip_dst) != 0)
			/*
			 * Allow raw socket in jail to receive multicast;
			 * assume process had PRIV_NETINET_RAW at attach,
			 * and fall through into normal filter path if so.
			 */
			continue;
		if (V_rip_bind_all_fibs == 0 && fib != inp->inp_inc.inc_fibnum)
			continue;

		/*
		 * If this raw socket has multicast state, and we
		 * have received a multicast, check if this socket
		 * should receive it, as multicast filtering is now
		 * the responsibility of the transport layer.
		 */
		if (inp->inp_moptions != NULL &&
		    IN_MULTICAST(ntohl(ctx.ip->ip_dst.s_addr))) {
			/*
			 * If the incoming datagram is for IGMP, allow it
			 * through unconditionally to the raw socket.
			 *
			 * In the case of IGMPv2, we may not have explicitly
			 * joined the group, and may have set IFF_ALLMULTI
			 * on the interface. imo_multi_filter() may discard
			 * control traffic we actually need to see.
			 *
			 * Userland multicast routing daemons should continue
			 * to filter the control traffic appropriately.
			 */
			int blocked;

			blocked = MCAST_PASS;
			if (proto != IPPROTO_IGMP) {
				struct sockaddr_in group;

				bzero(&group, sizeof(struct sockaddr_in));
				group.sin_len = sizeof(struct sockaddr_in);
				group.sin_family = AF_INET;
				group.sin_addr = ctx.ip->ip_dst;

				blocked = imo_multi_filter(inp->inp_moptions,
				    ifp,
				    (struct sockaddr *)&group,
				    (struct sockaddr *)&ripsrc);
			}

			if (blocked != MCAST_PASS) {
				IPSTAT_INC(ips_notmember);
				continue;
			}
		}
		appended += rip_append(inp, ctx.ip, m, &ripsrc);
	}
	/*
	 * Nobody took the packet.  If raw IP is the registered handler for
	 * this protocol, count it as "no protocol", undo the "delivered"
	 * count and hand the packet to icmp_error(); otherwise just free it.
	 */
	if (appended == 0 && ip_protox[ctx.ip->ip_p] == rip_input) {
		IPSTAT_INC(ips_noproto);
		IPSTAT_DEC(ips_delivered);
		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL, 0, 0);
	} else
		m_freem(m);
	return (IPPROTO_DONE);
}

/*
 * Generate IP header and pass packet to ip_output.  Tack on options user may
 * have setup with control call.
 */
/*
 * rip_send() is the pr_send handler for raw IP sockets.
 *
 * so       - sending socket; its PCB must exist.
 * pruflags - not used.
 * m        - packet to send; consumed on every path (freed here on error,
 *            handed to ip_output() otherwise).
 * nam      - destination; required on an unconnected socket and rejected
 *            with EISCONN on a connected one.
 * control  - ancillary data; freed without being looked at.
 * td       - not used.
 *
 * Without INP_HDRINCL an IP header is prepended and filled in from the PCB.
 * With INP_HDRINCL the caller-supplied header is validated (length fields,
 * option structure, jail source address) and sent with IP_RAWOUTPUT.
 *
 * Returns 0 or an errno value.  The PCB read lock is taken partway through
 * each branch; every return after that point releases it.
 */
static int
rip_send(struct socket *so, int pruflags, struct mbuf *m, struct sockaddr *nam,
    struct mbuf *control, struct thread *td)
{
	struct epoch_tracker et;
	struct ip *ip;
	struct inpcb *inp;
	in_addr_t *dst;
	int error, flags, cnt, hlen;
	u_char opttype, optlen, *cp;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("rip_send: inp == NULL"));

	/* Control data is not supported on raw sockets; drop it. */
	if (control != NULL) {
		m_freem(control);
		control = NULL;
	}

	/* Work out the destination address. */
	if (so->so_state & SS_ISCONNECTED) {
		if (nam) {
			error = EISCONN;
			m_freem(m);
			return (error);
		}
		dst = &inp->inp_faddr.s_addr;
	} else {
		if (nam == NULL)
			error = ENOTCONN;
		else if (nam->sa_family != AF_INET)
			error = EAFNOSUPPORT;
		else if (nam->sa_len != sizeof(struct sockaddr_in))
			error = EINVAL;
		else
			error = 0;
		if (error != 0) {
			m_freem(m);
			return (error);
		}
		dst = &((struct sockaddr_in *)nam)->sin_addr.s_addr;
	}

	flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) |
	    IP_ALLOWBROADCAST;

	/*
	 * If the user handed us a complete IP packet, use it.  Otherwise,
	 * allocate an mbuf for a header and fill it in.
	 */
	if ((inp->inp_flags & INP_HDRINCL) == 0) {
		if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) {
			m_freem(m);
			return(EMSGSIZE);
		}
		/* M_PREPEND leaves m NULL when it fails. */
		M_PREPEND(m, sizeof(struct ip), M_NOWAIT);
		if (m == NULL)
			return(ENOBUFS);

		INP_RLOCK(inp);
		ip = mtod(m, struct ip *);
		ip->ip_tos = inp->inp_ip_tos;
		if (inp->inp_flags & INP_DONTFRAG)
			ip->ip_off = htons(IP_DF);
		else
			ip->ip_off = htons(0);
		ip->ip_p = inp->inp_ip_p;
		ip->ip_len = htons(m->m_pkthdr.len);
		ip->ip_src = inp->inp_laddr;
		ip->ip_dst.s_addr = *dst;
#ifdef ROUTE_MPATH
		if (CALC_FLOWID_OUTBOUND) {
			uint32_t hash_type, hash_val;

			hash_val = fib4_calc_software_hash(ip->ip_src,
			    ip->ip_dst, 0, 0, ip->ip_p, &hash_type);
			m->m_pkthdr.flowid = hash_val;
			M_HASHTYPE_SET(m, hash_type);
			flags |= IP_NODEFAULTFLOWID;
		}
#endif
		if (jailed(inp->inp_cred)) {
			/*
			 * prison_local_ip4() would be good enough but would
			 * let a source of INADDR_ANY pass, which we do not
			 * want to see from jails.
			 */
			if (ip->ip_src.s_addr == INADDR_ANY) {
				NET_EPOCH_ENTER(et);
				error = in_pcbladdr(inp, &ip->ip_dst,
				    &ip->ip_src, inp->inp_cred);
				NET_EPOCH_EXIT(et);
			} else {
				error = prison_local_ip4(inp->inp_cred,
				    &ip->ip_src);
			}
			if (error != 0) {
				INP_RUNLOCK(inp);
				m_freem(m);
				return (error);
			}
		}
		ip->ip_ttl = inp->inp_ip_ttl;
	} else {
		if (m->m_pkthdr.len > IP_MAXPACKET) {
			m_freem(m);
			return (EMSGSIZE);
		}
		if (m->m_pkthdr.len < sizeof(*ip)) {
			m_freem(m);
			return (EINVAL);
		}
		/*
		 * Make the fixed header, and then the full header including
		 * options, contiguous.  m is not freed here when m_pullup()
		 * returns NULL.
		 * NOTE(review): presumably m_pullup() frees the chain on
		 * failure - confirm.
		 */
		m = m_pullup(m, sizeof(*ip));
		if (m == NULL)
			return (ENOMEM);
		ip = mtod(m, struct ip *);
		hlen = ip->ip_hl << 2;
		if (m->m_len < hlen) {
			m = m_pullup(m, hlen);
			if (m == NULL)
				return (EINVAL);
			ip = mtod(m, struct ip *);
		}
#ifdef ROUTE_MPATH
		if (CALC_FLOWID_OUTBOUND) {
			uint32_t hash_type, hash_val;

			/*
			 * NOTE(review): the address arguments are passed as
			 * (dst, src) here but as (src, dst) in the branch
			 * above - confirm whether the order matters to
			 * fib4_calc_software_hash().
			 */
			hash_val = fib4_calc_software_hash(ip->ip_dst,
			    ip->ip_src, 0, 0, ip->ip_p, &hash_type);
			m->m_pkthdr.flowid = hash_val;
			M_HASHTYPE_SET(m, hash_type);
			flags |= IP_NODEFAULTFLOWID;
		}
#endif
		INP_RLOCK(inp);
		/*
		 * Don't allow both user specified and setsockopt options,
		 * and don't allow packet length sizes that will crash.
		 */
		if ((hlen < sizeof (*ip))
		    || ((hlen > sizeof (*ip)) && inp->inp_options)
		    || (ntohs(ip->ip_len) != m->m_pkthdr.len)) {
			INP_RUNLOCK(inp);
			m_freem(m);
			return (EINVAL);
		}
		error = prison_check_ip4(inp->inp_cred, &ip->ip_src);
		if (error != 0) {
			INP_RUNLOCK(inp);
			m_freem(m);
			return (error);
		}
		/*
		 * Don't allow IP options which do not have the required
		 * structure as specified in section 3.1 of RFC 791 on
		 * pages 15-23.
		 */
		cp = (u_char *)(ip + 1);
		cnt = hlen - sizeof (struct ip);
		/*
		 * optlen is always assigned before the loop's step expression
		 * runs: either to 1 for NOP or from the option's length byte.
		 */
		for (; cnt > 0; cnt -= optlen, cp += optlen) {
			opttype = cp[IPOPT_OPTVAL];
			if (opttype == IPOPT_EOL)
				break;
			if (opttype == IPOPT_NOP) {
				optlen = 1;
				continue;
			}
			/* A length byte must fit in what is left. */
			if (cnt < IPOPT_OLEN + sizeof(u_char)) {
				INP_RUNLOCK(inp);
				m_freem(m);
				return (EINVAL);
			}
			optlen = cp[IPOPT_OLEN];
			/* Option must cover its own header and stay in bounds. */
			if (optlen < IPOPT_OLEN + sizeof(u_char) ||
			    optlen > cnt) {
				INP_RUNLOCK(inp);
				m_freem(m);
				return (EINVAL);
			}
		}
		/*
		 * This doesn't allow application to specify ID of zero,
		 * but we got this limitation from the beginning of history.
		 */
		if (ip->ip_id == 0)
			ip_fillid(ip, V_ip_random_id);

		/*
		 * XXX prevent ip_output from overwriting header fields.
		 */
		flags |= IP_RAWOUTPUT;
		IPSTAT_INC(ips_rawout);
	}

	if (inp->inp_flags & INP_ONESBCAST)
		flags |= IP_SENDONES;

#ifdef MAC
	mac_inpcb_create_mbuf(inp, m);
#endif

	/* The PCB stays read-locked across ip_output(). */
	NET_EPOCH_ENTER(et);
	error = ip_output(m, inp->inp_options, NULL, flags,
	    inp->inp_moptions, inp);
	NET_EPOCH_EXIT(et);
	INP_RUNLOCK(inp);
	return (error);
}

/*
 * Raw IP socket option processing.
 *
 * IMPORTANT NOTE regarding access control: Traditionally, raw sockets could
 * only be created by a privileged process, and as such, socket option
 * operations to manage system properties on any raw socket were allowed to
 * take place without explicit additional access control checks.  However,
 * raw sockets can now also be created in jail(), and therefore explicit
 * checks are now required.  Likewise, raw sockets can be used by a process
 * after it gives up privilege, so some caution is required.  For options
 * passed down to the IP layer via ip_ctloutput(), checks are assumed to be
 * performed in ip_ctloutput() and therefore no check occurs here.
 * Unilaterally checking priv_check() here breaks normal IP socket option
 * operations on raw sockets.
 *
 * When adding new socket options here, make sure to add access control
 * checks here as necessary.
 */
/*
 * Dispatch summary for rip_ctloutput():
 *  - levels other than IPPROTO_IP are rejected with EINVAL, except a
 *    SOL_SOCKET/SO_SETFIB set request, which is forwarded to ip_ctloutput();
 *  - IP_HDRINCL is handled locally;
 *  - ipfw and dummynet options go to their hook, or fail with ENOPROTOOPT
 *    when the hook is not set;
 *  - MRT_* and IP_RSVP_* options require PRIV_NETINET_MROUTE; MRT_*,
 *    IP_RSVP_ON and IP_RSVP_VIF_* also require a socket of the matching
 *    protocol (IGMP or RSVP);
 *  - everything else is passed to ip_ctloutput().
 * A sopt_dir that is neither SOPT_GET nor SOPT_SET falls through and
 * returns 0.
 */
int
rip_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct inpcb *inp = sotoinpcb(so);
	int error, optval;

	if (sopt->sopt_level != IPPROTO_IP) {
		if (sopt->sopt_dir == SOPT_SET &&
		    sopt->sopt_level == SOL_SOCKET &&
		    sopt->sopt_name == SO_SETFIB)
			return (ip_ctloutput(so, sopt));
		return (EINVAL);
	}

	error = 0;
	switch (sopt->sopt_dir) {
	case SOPT_GET:
		switch (sopt->sopt_name) {
		case IP_HDRINCL:
			/* Reports the raw flag bit, not a normalised 0/1. */
			optval = inp->inp_flags & INP_HDRINCL;
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;

		case IP_FW3:	/* generic ipfw v.3 functions */
		case IP_FW_ADD:	/* ADD actually returns the body... */
		case IP_FW_GET:
		case IP_FW_TABLE_GETSIZE:
		case IP_FW_TABLE_LIST:
		case IP_FW_NAT_GET_CONFIG:
		case IP_FW_NAT_GET_LOG:
			if (V_ip_fw_ctl_ptr != NULL)
				error = V_ip_fw_ctl_ptr(sopt);
			else
				error = ENOPROTOOPT;
			break;

		case IP_DUMMYNET3:	/* generic dummynet v.3 functions */
			if (ip_dn_ctl_ptr != NULL)
				error = ip_dn_ctl_ptr(sopt);
			else
				error = ENOPROTOOPT;
			break ;

		case MRT_INIT:
		case MRT_DONE:
		case MRT_ADD_VIF:
		case MRT_DEL_VIF:
		case MRT_ADD_MFC:
		case MRT_DEL_MFC:
		case MRT_VERSION:
		case MRT_ASSERT:
		case MRT_API_SUPPORT:
		case MRT_API_CONFIG:
		case MRT_ADD_BW_UPCALL:
		case MRT_DEL_BW_UPCALL:
			error = priv_check(curthread, PRIV_NETINET_MROUTE);
			if (error != 0)
				return (error);
			if (inp->inp_ip_p != IPPROTO_IGMP)
				return (EOPNOTSUPP);
			/* The hook is NULL unless something has set it. */
			error = ip_mrouter_get ? ip_mrouter_get(so, sopt) :
				EOPNOTSUPP;
			break;

		default:
			error = ip_ctloutput(so, sopt);
			break;
		}
		break;

	case SOPT_SET:
		switch (sopt->sopt_name) {
		case IP_HDRINCL:
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				break;
			/* inp_flags is modified under the PCB write lock. */
			INP_WLOCK(inp);
			if (optval)
				inp->inp_flags |= INP_HDRINCL;
			else
				inp->inp_flags &= ~INP_HDRINCL;
			INP_WUNLOCK(inp);
			break;

		case IP_FW3:	/* generic ipfw v.3 functions */
		case IP_FW_ADD:
		case IP_FW_DEL:
		case IP_FW_FLUSH:
		case IP_FW_ZERO:
		case IP_FW_RESETLOG:
		case IP_FW_TABLE_ADD:
		case IP_FW_TABLE_DEL:
		case IP_FW_TABLE_FLUSH:
		case IP_FW_NAT_CFG:
		case IP_FW_NAT_DEL:
			if (V_ip_fw_ctl_ptr != NULL)
				error = V_ip_fw_ctl_ptr(sopt);
			else
				error = ENOPROTOOPT;
			break;

		case IP_DUMMYNET3:	/* generic dummynet v.3 functions */
			if (ip_dn_ctl_ptr != NULL)
				error = ip_dn_ctl_ptr(sopt);
			else
				error = ENOPROTOOPT ;
			break ;

		case IP_RSVP_ON:
			error = priv_check(curthread, PRIV_NETINET_MROUTE);
			if (error != 0)
				return (error);
			if (inp->inp_ip_p != IPPROTO_RSVP)
				return (EOPNOTSUPP);
			error = ip_rsvp_init(so);
			break;

		case IP_RSVP_OFF:
			/* Unlike IP_RSVP_ON, no protocol check is made here. */
			error = priv_check(curthread, PRIV_NETINET_MROUTE);
			if (error != 0)
				return (error);
			error = ip_rsvp_done();
			break;

		case IP_RSVP_VIF_ON:
		case IP_RSVP_VIF_OFF:
			error = priv_check(curthread, PRIV_NETINET_MROUTE);
			if (error != 0)
				return (error);
			if (inp->inp_ip_p != IPPROTO_RSVP)
				return (EOPNOTSUPP);
			error = ip_rsvp_vif ?
				ip_rsvp_vif(so, sopt) : EINVAL;
			break;

		case MRT_INIT:
		case MRT_DONE:
		case MRT_ADD_VIF:
		case MRT_DEL_VIF:
		case MRT_ADD_MFC:
		case MRT_DEL_MFC:
		case MRT_VERSION:
		case MRT_ASSERT:
		case MRT_API_SUPPORT:
		case MRT_API_CONFIG:
		case MRT_ADD_BW_UPCALL:
		case MRT_DEL_BW_UPCALL:
			error = priv_check(curthread, PRIV_NETINET_MROUTE);
			if (error != 0)
				return (error);
			if (inp->inp_ip_p != IPPROTO_IGMP)
				return (EOPNOTSUPP);
			error = ip_mrouter_set ? ip_mrouter_set(so, sopt) :
					EOPNOTSUPP;
			break;

		default:
			error = ip_ctloutput(so, sopt);
			break;
		}
		break;
	}

	return (error);
}

/*
 * Control input for raw IP: the ICMP message is passed to IPsec when IPsec
 * support is compiled in and enabled; otherwise nothing is done.
 */
void
rip_ctlinput(struct icmp *icmp)
{
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	if (IPSEC_ENABLED(ipv4))
		IPSEC_CTLINPUT(ipv4, icmp);
#endif
}

/*
 * pr_attach: create the PCB for a new raw socket.
 *
 * Requires PRIV_NETINET_RAW and a protocol number in [0, IPPROTO_MAX).
 * Reserves socket buffer space, allocates the PCB, records the protocol and
 * TTL, and inserts the PCB into the raw hash.  Returns 0 or an errno value.
 */
static int
rip_attach(struct socket *so, int proto, struct thread *td)
{
	struct inpcb *inp;
	int error;

	inp = sotoinpcb(so);
	KASSERT(inp == NULL, ("rip_attach: inp != NULL"));

	error = priv_check(td, PRIV_NETINET_RAW);
	if (error)
		return (error);
	if (proto >= IPPROTO_MAX || proto < 0)
		return EPROTONOSUPPORT;
	error = soreserve(so, rip_sendspace, rip_recvspace);
	if (error)
		return (error);
	error = in_pcballoc(so, &V_ripcbinfo);
	if (error)
		return (error);
	/*
	 * NOTE(review): the PCB is unlocked below without a visible lock
	 * call, and rip_inshash() asserts the write lock; presumably
	 * in_pcballoc() returns the new PCB write-locked - confirm.
	 */
	inp = (struct inpcb *)so->so_pcb;
	inp->inp_ip_p = proto;
	inp->inp_ip_ttl = V_ip_defttl;
	INP_HASH_WLOCK(&V_ripcbinfo);
	rip_inshash(inp);
	INP_HASH_WUNLOCK(&V_ripcbinfo);
	INP_WUNLOCK(inp);
	return (0);
}

/*
 * pr_detach: tear down the PCB of a raw socket.  The socket must already be
 * disconnected (foreign address cleared).  Shuts down multicast routing or
 * RSVP state when this socket owns it, removes the PCB from the hash and
 * frees it.
 */
static void
rip_detach(struct socket *so)
{
	struct inpcb *inp;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("rip_detach: inp == NULL"));
	KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
	    ("rip_detach: not closed"));

	/* Disable mrouter first */
	if (so == V_ip_mrouter && ip_mrouter_done)
		ip_mrouter_done();

	INP_WLOCK(inp);
	INP_HASH_WLOCK(&V_ripcbinfo);
	rip_delhash(inp);
	INP_HASH_WUNLOCK(&V_ripcbinfo);

	if (ip_rsvp_force_done)
		ip_rsvp_force_done(so);
	if (so == V_ip_rsvpd)
		ip_rsvp_done();
	/*
	 * NOTE(review): there is no INP_WUNLOCK() after this call;
	 * presumably in_pcbfree() consumes the write lock - confirm.
	 */
	in_pcbfree(inp);
}

/*
 * Common disconnect path: clear the foreign address, move the PCB to the
 * hash bucket that matches its new state, and clear SS_ISCONNECTED on the
 * socket.
 */
static void
rip_dodisconnect(struct socket *so, struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo;

	pcbinfo = inp->inp_pcbinfo;
	INP_WLOCK(inp);
	INP_HASH_WLOCK(pcbinfo);
	/* Re-hash: the bucket depends on the foreign address. */
	rip_delhash(inp);
	inp->inp_faddr.s_addr = INADDR_ANY;
	rip_inshash(inp);
	INP_HASH_WUNLOCK(pcbinfo);
	SOCK_LOCK(so);
	so->so_state &= ~SS_ISCONNECTED;
	SOCK_UNLOCK(so);
	INP_WUNLOCK(inp);
}

/*
 * pr_abort: disconnect the socket.
 */
static void
rip_abort(struct socket *so)
{
	struct inpcb *inp;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("rip_abort: inp == NULL"));

	rip_dodisconnect(so, inp);
}

/*
 * pr_close: disconnect the socket; performs the same work as rip_abort().
 */
static void
rip_close(struct socket *so)
{
	struct inpcb *inp;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("rip_close: inp == NULL"));

	rip_dodisconnect(so, inp);
}

/*
 * pr_disconnect: returns ENOTCONN when the socket is not connected,
 * otherwise disconnects it and returns 0.
 */
static int
rip_disconnect(struct socket *so)
{
	struct inpcb *inp;

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("rip_disconnect: inp == NULL"));

	rip_dodisconnect(so, inp);
	return (0);
}

/*
 * pr_bind: set the local address of a raw socket.
 *
 * Returns EAFNOSUPPORT or EINVAL for a malformed sockaddr, the error from
 * prison_check_ip4() when the caller's jail rejects the address, and
 * EADDRNOTAVAIL when there are no interfaces or when a non-zero address is
 * not configured locally and INP_BINDANY is not set on the PCB.
 */
static int
rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	struct sockaddr_in *addr = (struct sockaddr_in *)nam;
	struct inpcb *inp;
	int error;

	if (nam->sa_family != AF_INET)
		return (EAFNOSUPPORT);
	if (nam->sa_len != sizeof(*addr))
		return (EINVAL);

	error = prison_check_ip4(td->td_ucred, &addr->sin_addr);
	if (error != 0)
		return (error);

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("rip_bind: inp == NULL"));

	/*
	 * NOTE(review): addr aliases nam, whose family was already required
	 * to be AF_INET above, so the AF_IMPLINK alternative looks
	 * unreachable here - confirm.
	 */
	if (CK_STAILQ_EMPTY(&V_ifnet) ||
	    (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) ||
	    (addr->sin_addr.s_addr &&
	     (inp->inp_flags & INP_BINDANY) == 0 &&
	     ifa_ifwithaddr_check((struct sockaddr *)addr) == 0))
		return (EADDRNOTAVAIL);

	INP_WLOCK(inp);
	INP_HASH_WLOCK(&V_ripcbinfo);
	/* Re-hash: the bucket depends on the local address. */
	rip_delhash(inp);
	inp->inp_laddr = addr->sin_addr;
	rip_inshash(inp);
	INP_HASH_WUNLOCK(&V_ripcbinfo);
	INP_WUNLOCK(inp);
	return (0);
}

/*
 * pr_connect: set the foreign address of a raw socket and mark the socket
 * connected.  Returns EINVAL for a wrong sockaddr length, EADDRNOTAVAIL when
 * there are no interfaces, and EAFNOSUPPORT for an address family other
 * than AF_INET or AF_IMPLINK.  td is not used.
 */
static int
rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	struct sockaddr_in *addr = (struct sockaddr_in *)nam;
	struct inpcb *inp;

	if (nam->sa_len != sizeof(*addr))
		return (EINVAL);
	if (CK_STAILQ_EMPTY(&V_ifnet))
		return (EADDRNOTAVAIL);
	if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK)
		return (EAFNOSUPPORT);

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("rip_connect: inp == NULL"));

	INP_WLOCK(inp);
	INP_HASH_WLOCK(&V_ripcbinfo);
	/* Re-hash: the bucket depends on the foreign address. */
	rip_delhash(inp);
	inp->inp_faddr = addr->sin_addr;
	rip_inshash(inp);
	INP_HASH_WUNLOCK(&V_ripcbinfo);
	soisconnected(so);
	INP_WUNLOCK(inp);
	return (0);
}

/*
 * pr_shutdown: returns ENOTCONN on an unconnected socket.  Otherwise
 * SHUT_RD flushes the receive side, SHUT_WR disallows further sends, and
 * SHUT_RDWR does both.
 */
static int
rip_shutdown(struct socket *so, enum shutdown_how how)
{

	SOCK_LOCK(so);
	if (!(so->so_state & SS_ISCONNECTED)) {
		SOCK_UNLOCK(so);
		return (ENOTCONN);
	}
	SOCK_UNLOCK(so);

	switch (how) {
	case SHUT_RD:
		sorflush(so);
		break;
	case SHUT_RDWR:
		sorflush(so);
		/* FALLTHROUGH */
	case SHUT_WR:
		socantsendmore(so);
	}

	return (0);
}
#endif /* INET */

/*
 * Sysctl handler that exports the list of raw IP PCBs.
 *
 * The handler is read-only (EPERM on a write attempt).  With no output
 * buffer it only reports a size estimate.  Otherwise the output is a
 * struct xinpgen header, one struct xinpcb per PCB that passes the
 * generation and cr_canseeinpcb() checks, and a trailing struct xinpgen
 * carrying the counters as they stand after the walk.
 */
static int
rip_pcblist(SYSCTL_HANDLER_ARGS)
{
	struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_ripcbinfo,
	    INPLOOKUP_RLOCKPCB);
	struct xinpgen xig;
	struct inpcb *inp;
	int error;

	if (req->newptr != 0)
		return (EPERM);

	if (req->oldptr == 0) {
		int n;

		/*
		 * Size probe: current count plus headroom of one eighth,
		 * but at least 10 entries, plus the two xinpgen records.
		 */
		n = V_ripcbinfo.ipi_count;
		n += imax(n / 8, 10);
		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb);
		return (0);
	}

	if ((error = sysctl_wire_old_buffer(req, 0)) != 0)
		return (error);

	bzero(&xig, sizeof(xig));
	xig.xig_len = sizeof xig;
	xig.xig_count = V_ripcbinfo.ipi_count;
	xig.xig_gen = V_ripcbinfo.ipi_gencnt;
	xig.xig_sogen = so_gencnt;
	error = SYSCTL_OUT(req, &xig, sizeof xig);
	if (error)
		return (error);

	while ((inp = inp_next(&inpi)) != NULL) {
		/* Skip PCBs newer than the header's generation count. */
		if (inp->inp_gencnt <= xig.xig_gen &&
		    cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
			struct xinpcb xi;

			in_pcbtoxinpcb(inp, &xi);
			error = SYSCTL_OUT(req, &xi, sizeof xi);
			if (error) {
				/*
				 * Leaving the walk early: release the PCB
				 * ourselves.
				 * NOTE(review): presumably inp_next() would
				 * otherwise unlock it on the next call -
				 * confirm.
				 */
				INP_RUNLOCK(inp);
				break;
			}
		}
	}

	if (!error) {
		/*
		 * Give the user an updated idea of our state.  If the
		 * generation differs from what we told her before, she knows
		 * that something happened while we were processing this
		 * request, and it might be necessary to retry.
		 */
		xig.xig_gen = V_ripcbinfo.ipi_gencnt;
		xig.xig_sogen = so_gencnt;
		xig.xig_count = V_ripcbinfo.ipi_count;
		error = SYSCTL_OUT(req, &xig, sizeof xig);
	}

	return (error);
}

SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist,
    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    rip_pcblist, "S,xinpcb",
    "List of active raw IP sockets");

#ifdef INET
/*
 * Protocol switch entry for raw IP sockets, wiring the handlers defined in
 * this file plus the generic in_* helpers.
 */
struct protosw rip_protosw = {
	.pr_type =		SOCK_RAW,
	.pr_flags =		PR_ATOMIC|PR_ADDR,
	.pr_ctloutput =		rip_ctloutput,
	.pr_abort =		rip_abort,
	.pr_attach =		rip_attach,
	.pr_bind =		rip_bind,
	.pr_connect =		rip_connect,
	.pr_control =		in_control,
	.pr_detach =		rip_detach,
	.pr_disconnect =	rip_disconnect,
	.pr_peeraddr =		in_getpeeraddr,
	.pr_send =		rip_send,
	.pr_shutdown =		rip_shutdown,
	.pr_sockaddr =		in_getsockaddr,
	.pr_sosetlabel =	in_pcbsosetlabel,
	.pr_close =		rip_close
};
#endif /* INET */